Unverified commit 51d261e7, authored by J-shang, committed by GitHub

Merge pull request #4668 from microsoft/doc-refactor

parents d63a2ea3 b469e1c1
@@ -124,7 +124,21 @@ class WeightScoreTrainerBasedDataCollector(TrainerBasedDataCollector):
class MovementPruner(BasicPruner):
r"""
Movement pruner is an implementation of movement pruning.
This is a "fine-pruning" algorithm, which means the masks may change during each fine-tuning step.
Each weight element will be scored by the opposite of the sum of the product of weight and its gradient during each step.
This means the weight elements moving towards zero will accumulate negative scores, the weight elements moving away from zero will accumulate positive scores.
The weight elements with low scores will be masked during inference.
The following figure from the paper shows the weight pruning by movement pruning.
.. image:: ../../img/movement_pruning.png
:target: ../../img/movement_pruning.png
:alt:
For more details, please refer to `Movement Pruning: Adaptive Sparsity by Fine-Tuning <https://arxiv.org/abs/2005.07683>`__.
Parameters
----------
model : torch.nn.Module
@@ -158,7 +172,7 @@ class MovementPruner(BasicPruner):
model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance whose optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_epochs : int
@@ -171,6 +185,21 @@ class MovementPruner(BasicPruner):
The number of steps at which sparsity stops growing; note that sparsity stopping growth doesn't mean the masks stop changing.
The sparsity after each `optimizer.step()` is:
total_sparsity * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3).
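A minimal sketch of this cubic schedule (the clamping to 0 before ``warm_up_step`` and to ``total_sparsity`` after ``cool_down_beginning_step`` is an assumption inferred from the descriptions above, not a quote of NNI's implementation):

.. code-block:: python

    def movement_sparsity(step: int, total_sparsity: float,
                          warm_up_step: int, cool_down_beginning_step: int) -> float:
        # Cubic ramp from 0 to total_sparsity between warm-up and cool-down.
        if step <= warm_up_step:
            return 0.0
        if step >= cool_down_beginning_step:
            return total_sparsity
        progress = (step - warm_up_step) / (cool_down_beginning_step - warm_up_step)
        return total_sparsity * (1 - (1 - progress) ** 3)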
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import MovementPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = MovementPruner(model, config_list, trainer, traced_optimizer, criterion, 10, 3000, 27000)
>>> masked_model, masks = pruner.compress()
For a detailed example, please refer to :githublink:`examples/model_compress/pruning/v2/movement_pruning_glue.py <examples/model_compress/pruning/v2/movement_pruning_glue.py>`
"""
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
traced_optimizer: Traceable, criterion: Callable[[Tensor, Tensor], Tensor], training_epochs: int, warm_up_step: int,
...
@@ -22,15 +22,14 @@ _logger = logging.getLogger(__name__)
class DataCollector:
"""
An abstract class for collecting the data needed by the compressor.
Parameters
----------
compressor
The compressor bound to this DataCollector.
"""
def __init__(self, compressor: Compressor):
"""
Parameters
----------
compressor
The compressor binded with this DataCollector.
"""
self.compressor = compressor
def reset(self):
@@ -242,42 +241,43 @@ class TrainerBasedDataCollector(DataCollector):
class MetricsCalculator:
"""
An abstract class for calculating a kind of metric of the given data.
Parameters
----------
dim
The dimensions corresponding to the under-pruning weight dimensions in the collected data.
None means one-to-one correspondence between pruned dimensions and data, which equals setting `dim` as all data dimensions.
Only these `dim` will be kept and other dimensions of the data will be reduced.
Example:
If you want to prune the Conv2d weight at filter level, and the weight size is (32, 16, 3, 3) [out-channel, in-channel, kernel-size-1, kernel-size-2],
then the under-pruning dimension is [0], which means you want to prune the filter or out-channel.
Case 1: Directly collect the conv module weight as data to calculate the metric.
Then the data has size (32, 16, 3, 3).
Note that dimension 0 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=0` will be set in `__init__`.
Case 2: Use the output of the conv module as data to calculate the metric.
Then the data has size (batch_num, 32, feature_map_size_1, feature_map_size_2).
Note that dimension 1 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=1` will be set in `__init__`.
In both of these two cases, the metric of this module has size (32,).
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The under-pruning weight size is (768, 768), and you want to apply block sparsity on dim=[0] with block size [64, 768],
then you can set block_sparse_size=[64]. The final metric size is (12,).
"""
def __init__(self, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None):
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
if self.block_sparse_size is not None:
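To make the ``dim`` semantics above concrete, here is a small illustrative sketch (not NNI code; the L1-norm metric is an arbitrary stand-in):

.. code-block:: python

    import torch

    data = torch.randn(32, 16, 3, 3)   # Conv2d weight, pruning dim is [0]
    keep_dims = [0]
    reduce_dims = [d for d in range(data.dim()) if d not in keep_dims]
    metric = data.abs().sum(dim=reduce_dims)   # size (32,), one value per filter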
@@ -307,36 +307,35 @@ class MetricsCalculator:
class SparsityAllocator:
"""
An abstract class for allocating masks based on metrics.
Parameters
----------
pruner
The pruner bound to this `SparsityAllocator`.
dim
The under-pruning weight dimensions; the metric size should equal the under-pruning weight size on these dimensions.
None means one-to-one correspondence between pruned dimensions and metric, which equals setting `dim` as all under-pruning weight dimensions.
The mask will expand to the weight size depending on `dim`.
Example:
The under-pruning weight has size (2, 3, 4), and `dim=1` means the under-pruning weight dimension is 1.
Then the metric should have size (3,), e.g., `metric=[0.9, 0.1, 0.8]`.
Assuming some kind of `SparsityAllocator` gets the mask on weight dimension 1 as `mask=[1, 0, 1]`,
then the dimension mask will expand to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The metric size is (12,), and block_sparse_size=[64], then the mask will first expand to (768,) before expanding with `dim`.
continuous_mask
Inherit the mask already in the wrapper if set True.
"""
def __init__(self, pruner: Compressor, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None, continuous_mask: bool = True):
"""
Parameters
----------
pruner
The pruner that binded with this `SparsityAllocator`.
dim
The under pruning weight dimensions, which metric size should equal to the under pruning weight size on these dimensions.
None means one-to-one correspondence between pruned dimensions and metric, which equal to set `dim` as all under pruning weight dimensions.
The mask will expand to the weight size depend on `dim`.
Example:
The under pruning weight has size (2, 3, 4), and `dim=1` means the under pruning weight dimension is 1.
Then the metric should have a size (3,), i.e., `metric=[0.9, 0.1, 0.8]`.
Assuming by some kind of `SparsityAllocator` get the mask on weight dimension 1 `mask=[1, 0, 1]`,
then the dimension mask will expand to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This used to describe the block size a metric value represented. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and the block_sparse_size dimension position is corresponding to dim.
Example:
The metric size is (12,), and block_sparse_size=[64], then the mask will expand to (768,) at first before expand with `dim`.
continuous_mask
Inherit the mask already in the wrapper if set True.
"""
self.pruner = pruner
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
...
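The mask expansion described in `SparsityAllocator` can be sketched in a few lines of plain PyTorch (illustrative only, not NNI's actual implementation):

.. code-block:: python

    import torch

    weight = torch.randn(2, 3, 4)                      # under-pruning weight, dim=1
    mask_1d = torch.tensor([1.0, 0.0, 1.0])            # mask on weight dimension 1
    mask = mask_1d.reshape(1, 3, 1).expand_as(weight)  # broadcast to the full weight size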
@@ -200,6 +200,17 @@ def compute_sparsity(origin_model: Module, compact_model: Module, compact_model_
The compact model is the origin model after pruning,
and it may have a different structure from origin_model because of speedup.
Parameters
----------
origin_model : torch.nn.Module
The original un-pruned model.
compact_model : torch.nn.Module
The model after speedup, or the original model.
compact_model_masks: Dict[str, Dict[str, Tensor]]
The masks applied on the compact model; if the original model has been sped up, this should be {}.
config_list : List[Dict]
The config_list used when pruning the original model.
Returns
-------
Tuple[List[Dict], List[Dict], List[Dict]]
...
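As an aside, the per-module sparsity ratio that ``compute_sparsity`` reasons about can be illustrated like this (a sketch; the real function returns the three config-list-shaped results documented above):

.. code-block:: python

    import torch

    def mask_sparsity(mask: torch.Tensor) -> float:
        # Fraction of elements that are masked out (zero) in a weight mask.
        return 1.0 - mask.count_nonzero().item() / mask.numel()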
@@ -20,27 +20,64 @@ LOGGER = logging.getLogger('batch_tuner_AutoML')
class BatchTuner(Tuner):
"""
Batch tuner is a special tuner that allows users to simply provide several hyperparameter sets,
and it will evaluate each set.
Batch tuner does **not** support standard search space.
Search space of batch tuner looks like a single ``choice`` in standard search space,
but it has different meaning.
Consider the following search space:

.. code-block::

    'combine_params': {
        '_type': 'choice',
        '_value': [
            {'x': 0, 'y': 1},
            {'x': 1, 'y': 2},
            {'x': 1, 'y': 3},
        ]
    }

Batch tuner will generate the following 3 hyperparameter sets:
1. {'x': 0, 'y': 1}
2. {'x': 1, 'y': 2}
3. {'x': 1, 'y': 3}
If this search space were used with the grid search tuner, it would instead generate:
1. {'combine_params': {'x': 0, 'y': 1 }}
2. {'combine_params': {'x': 1, 'y': 2 }}
3. {'combine_params': {'x': 1, 'y': 3 }}
Examples
--------

.. code-block::

    config.search_space = {
        'combine_params': {
            '_type': 'choice',
            '_value': [
                {'optimizer': 'Adam', 'learning_rate': 0.001},
                {'optimizer': 'Adam', 'learning_rate': 0.0001},
                {'optimizer': 'Adam', 'learning_rate': 0.00001},
                {'optimizer': 'SGD', 'learning_rate': 0.01},
                {'optimizer': 'SGD', 'learning_rate': 0.005},
            ]
        }
    }
    config.tuner.name = 'BatchTuner'
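On the trial side, each provided set then arrives as an ordinary dict; a sketch (``train_and_eval`` is a hypothetical user function):

.. code-block:: python

    import nni

    params = nni.get_next_parameter()    # e.g. {'optimizer': 'Adam', 'learning_rate': 0.001}
    accuracy = train_and_eval(**params)  # hypothetical user training function
    nni.report_final_result(accuracy)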
""" """
def __init__(self): def __init__(self):
self._count = -1 self._count = -1
self._values = [] self._values = []
def is_valid(self, search_space): def _is_valid(self, search_space):
""" """
Check the search space is valid: only contains 'choice' type Check the search space is valid: only contains 'choice' type
@@ -70,27 +107,10 @@ class BatchTuner(Tuner):
return None
def update_search_space(self, search_space):
"""Update the search space
Parameters
----------
search_space : dict
"""
validate_search_space(search_space, ['choice'])
self._values = self._is_valid(search_space)
def generate_parameters(self, parameter_id, **kwargs):
"""Returns a dict of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
dict
A candidate parameter group.
"""
self._count += 1
if self._count > len(self._values) - 1:
raise nni.NoMoreTrialError('no more parameters now.')
@@ -100,13 +120,6 @@ class BatchTuner(Tuner):
pass
def import_data(self, data):
"""Import additional data for tuning
Parameters
----------
data:
a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
"""
if not self._values:
LOGGER.info("Search space has not been initialized, skip this data import")
return
...
@@ -249,20 +249,52 @@ class BOHBClassArgsValidator(ClassArgsValidator):
class BOHB(MsgDispatcherBase):
"""
`BOHB <https://arxiv.org/abs/1807.01774>`__ is a robust and efficient hyperparameter tuning algorithm at scale.
BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".
BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget,
but it replaces the random selection of configurations at the beginning of each HB iteration
by a model-based search (Bayesian Optimization).
Once the desired number of configurations for the iteration is reached,
the standard successive halving procedure is carried out using these configurations.
It keeps track of the performance of all function evaluations g(x, b) of configurations x
on all budgets b to use as a basis for our models in later iterations.
Please refer to the paper :footcite:t:`falkner2018bohb` for detailed algorithm.
Note that BOHB needs additional installation using the following command:
.. code-block:: bash

    pip install nni[BOHB]
Examples
--------
.. code-block::

    config.advisor.name = 'BOHB'
    config.advisor.class_args = {
        'optimize_mode': 'maximize',
        'min_budget': 1,
        'max_budget': 27,
        'eta': 3,
        'min_points_in_model': 7,
        'top_n_percent': 15,
        'num_samples': 64,
        'random_fraction': 0.33,
        'bandwidth_factor': 3.0,
        'min_bandwidth': 0.001
    }
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
min_budget: float
The smallest budget to assign to a trial job (budget can be the number of mini-batches or epochs).
Needs to be positive.
max_budget: float
The largest budget to assign to a trial job. Needs to be larger than min_budget.
The budgets will be geometrically distributed
:math:`\\sim \\eta^k` for :math:`k\\in [0, 1, ... , num\\_subsets - 1]`.
eta: int
@@ -271,21 +303,102 @@ class BOHB(MsgDispatcherBase):
1/eta of them 'advances' to the next round.
Must be greater or equal to 2.
min_points_in_model: int
Number of observations to start building a KDE. Default 'None' means dim+1;
when the number of completed trials in this budget is equal to or larger than ``max{dim+1, min_points_in_model}``,
BOHB will start to build a KDE model of this budget then use said KDE model to guide configuration selection.
Needs to be positive. (dim means the number of hyperparameters in search space)
top_n_percent: int
Percentage (between 1 and 99, default 15) of the observations which are considered good.
Good points and bad points are used for building KDE models.
For example, if you have 100 observed trials and top_n_percent is 15,
then the top 15% of points will be used for building the good points models "l(x)".
The remaining 85% of points will be used for building the bad point models "g(x)".
num_samples: int
Number of samples to optimize EI (default 64).
In this case, it will sample "num_samples" points and compare the result of l(x)/g(x).
Then it will return the one with the maximum l(x)/g(x) value as the next configuration
if the optimize_mode is ``maximize``. Otherwise, it returns the smallest one.
random_fraction: float
Fraction of purely random configurations that are sampled from the prior without the model.
bandwidth_factor: float
To encourage diversity, the points proposed to optimize EI are sampled
from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3).
It is suggested to use the default value if you are not familiar with KDE.
min_bandwidth: float
To keep diversity, even when all (good) samples have the same value for one of the parameters,
a minimum bandwidth (default: 1e-3) is used instead of zero.
It is suggested to use the default value if you are not familiar with KDE.
config_space: str
Directly use a .pcs file serialized by `ConfigSpace <https://automl.github.io/ConfigSpace/>`__ in "pcs new" format.
In this case, search space file (if provided in config) will be ignored.
Note that this path needs to be an absolute path. Relative path is currently not supported.
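For intuition, the geometric budget spacing can be sketched as plain arithmetic (illustrative only, using the example values ``min_budget=1``, ``max_budget=27``, ``eta=3``):

.. code-block:: python

    import math

    min_budget, max_budget, eta = 1, 27, 3
    num_subsets = int(math.log(max_budget / min_budget, eta)) + 1   # 4
    budgets = [max_budget * eta ** -k for k in range(num_subsets)]  # [27.0, 9.0, 3.0, 1.0]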
Notes
-----
Below is an introduction to the BOHB process, separated into two parts:
**The first part HB (Hyperband).**
BOHB follows Hyperband's way of choosing the budgets and continues to use SuccessiveHalving.
For more details, you can refer to the :class:`nni.algorithms.hpo.hyperband_advisor.Hyperband`
and the `reference paper for Hyperband <https://arxiv.org/abs/1603.06560>`__.
This procedure is summarized by the pseudocode below.
.. image:: ../../img/bohb_1.png
:scale: 80 %
:align: center
**The second part BO (Bayesian Optimization)**
The BO part of BOHB closely resembles TPE with one major difference:
It opted for a single multidimensional KDE compared to the hierarchy of one-dimensional KDEs used in TPE
in order to better handle interaction effects in the input space.
Tree Parzen Estimator (TPE) uses a KDE (kernel density estimator) to model the densities.
.. image:: ../../img/bohb_2.png
:scale: 80 %
:align: center
To fit useful KDEs, we require a minimum number of data points Nmin;
this is set to d + 1 for our experiments, where d is the number of hyperparameters.
To build a model as early as possible, we do not wait until Nb = \|Db\|,
the number of observations for budget b, is large enough to satisfy q · Nb ≥ Nmin.
Instead, after initializing with Nmin + 2 random configurations, we choose the
best and worst configurations, respectively, to model the two densities.
Note that it also samples a constant fraction named **random fraction** of the configurations uniformly at random.
.. image:: ../../img/bohb_3.png
:scale: 80 %
:align: center
.. image:: ../../img/bohb_6.jpg
:scale: 65 %
:align: center
**The above image shows the workflow of BOHB.**
Here we set max_budget = 9, min_budget = 1, eta = 3, and others as default.
In this case, s_max = 2, so we will continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle.
In each stage of SuccessiveHalving (the orange box), it will pick the top 1/eta configurations and run them again with more budget,
repeating the SuccessiveHalving stage until the end of this iteration.
At the same time, it collects the configurations, budgets and final metrics of each trial
and uses these to build a multidimensional KDE model with the key "budget".
Multidimensional KDE is used to guide the selection of configurations for the next iteration.
The sampling procedure (using Multidimensional KDE to guide selection) is summarized by the pseudocode below.
.. image:: ../../img/bohb_4.png
:scale: 80 %
:align: center
**Here is a simple experiment which tunes MNIST with BOHB.**
Code implementation: :githublink:`examples/trials/mnist-advisor <examples/trials/mnist-advisor>`
The following is the experimental final results:
.. image:: ../../img/bohb_5.png
:scale: 80 %
:align: center
More experimental results can be found in the `reference paper <https://arxiv.org/abs/1807.01774>`__.
It shows that BOHB makes good use of previous results and has a balanced trade-off in exploration and exploitation.
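A toy sketch of the good/bad split driven by ``top_n_percent`` (synthetic data; the real implementation fits multidimensional KDEs as described above):

.. code-block:: python

    import numpy as np

    observations = np.random.rand(100)            # stand-in for observed metrics
    order = np.argsort(observations)[::-1]        # best first, for 'maximize' mode
    n_good = max(1, int(len(observations) * 15 / 100))
    good, bad = order[:n_good], order[n_good:]    # indices used to fit l(x) and g(x)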
""" """
def __init__(self, def __init__(self,
......
...@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator): ...@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator):
}).validate(kwargs) }).validate(kwargs)
class CurvefittingAssessor(Assessor): class CurvefittingAssessor(Assessor):
"""CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future. """
CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future.
The intermediate result **must** be accuracy. Curve fitting does not support minimizing loss.
Curve fitting assessor is an LPA (learning, predicting, assessing) algorithm.
It stops a pending trial X at step S if the trial's forecast result at the target step converges and is lower than the
best performance in the history.
Paper: `Speeding up Automatic Hyperparameter Optimization of Deep Neural Networks by Extrapolation of Learning Curves
<https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf>`__
Examples
--------
.. code-block::

    config.assessor.name = 'Curvefitting'
    config.assessor.class_args = {
        'epoch_num': 20,
        'start_step': 6,
        'threshold': 0.9,
        'gap': 1,
    }
Parameters
----------
epoch_num : int
The total number of epochs.
We need to know the number of epochs to determine which points we need to predict.
start_step : int
A trial is determined to be stopped or not only after receiving start_step number of intermediate results.
threshold : float
The threshold that we use to decide to early stop the worst performance curve.
For example: if threshold = 0.95, and the best performance in the history is 0.9,
then we will stop the trial whose predicted value is lower than 0.95 * 0.9 = 0.855.
gap : int
The gap interval between assessor judgements.
For example: if gap = 2, start_step = 6,
then we will assess the result when we get 6, 8, 10, 12, ... intermediate results.
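Putting ``start_step``, ``gap`` and ``threshold`` together, a sketch of when and how the assessor judges (plain arithmetic, not the actual implementation):

.. code-block:: python

    start_step, gap, epoch_num = 6, 2, 20
    assess_points = list(range(start_step, epoch_num + 1, gap))  # [6, 8, 10, ..., 20]

    threshold, best_history = 0.95, 0.9
    stop_if_prediction_below = threshold * best_history          # 0.855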
""" """
def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1): def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1):
...@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor): ...@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor):
logger.info('Successfully initials the curvefitting assessor') logger.info('Successfully initials the curvefitting assessor')
def trial_end(self, trial_job_id, success): def trial_end(self, trial_job_id, success):
"""update the best performance of completed trial job
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if succssfully finish the experiment, False otherwise
"""
if success:
if self.set_best_performance:
self.completed_best_performance = max(self.completed_best_performance, self.trial_history[-1])
@@ -76,25 +101,6 @@ class CurvefittingAssessor(Assessor):
logger.info('No need to update, trial job id: %s', trial_job_id)
def assess_trial(self, trial_job_id, trial_history):
"""assess whether a trial should be early stop by curve fitting algorithm
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognize exception in curvefitting_assessor
"""
scalar_trial_history = extract_scalar_history(trial_history)
self.trial_history = scalar_trial_history
if not self.set_best_performance:
...
@@ -44,7 +44,20 @@ def _random_config(search_space, random_state):
class DNGOTuner(Tuner):
"""
Use neural networks as an alternative to GPs to model distributions over functions in Bayesian optimization.
Parameters
----------
optimize_mode : maximize | minimize, default = maximize
If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
sample_size : int, default = 1000
Number of samples to select in each iteration. The best one will be picked from the samples as the next trial.
trials_per_update : int, default = 20
Number of trials to collect before updating the model.
num_epochs_per_training : int, default = 500
Number of epochs to train DNGO model.
"""
def __init__(self, optimize_mode='maximize', sample_size=1000, trials_per_update=20, num_epochs_per_training=500):
self.searchspace_json = None
self.random_state = None
...
@@ -4,6 +4,7 @@
"""
evolution_tuner.py
"""
from __future__ import annotations
import copy
import random
@@ -22,28 +23,19 @@ logger = logging.getLogger(__name__)
class Individual:
"""
Individual class to store the individual's info.
Parameters
----------
config : str, default = None
Search space.
info : str, default = None
The str to save information of the individual.
result : float, default = None
The final metric of an individual.
"""
def __init__(self, config=None, info=None, result=None):
"""
Parameters
----------
config : str
A config to represent a group of parameters.
info : str
result : float
save_dir : str
"""
self.config = config
self.result = result
self.info = info
@@ -61,18 +53,36 @@ class EvolutionClassArgsValidator(ClassArgsValidator):
class EvolutionTuner(Tuner):
"""
Naive Evolution comes from `Large-Scale Evolution of Image Classifiers <https://arxiv.org/pdf/1703.01041.pdf>`__.
It randomly initializes a population based on the search space.
For each generation, it chooses better ones and does some mutation.
(e.g., changes a hyperparameter, adds/removes one layer, etc.) on them to get the next generation.
Naive Evolution requires many trials to work, but it is very simple and easy to expand with new features.
Examples
--------
.. code-block::

    config.tuner.name = 'Evolution'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'population_size': 100
    }
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
population_size: int
The initial size of the population (trial num) in the evolution tuner (default=32).
The larger the population size, the better the evolution performance.
It's suggested that ``population_size`` be much larger than ``concurrency`` so users can get the most out of the algorithm.
It should be at least ``concurrency``, or the tuner will fail on its first generation of parameters.
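A toy sketch of one naive-evolution step (deliberately simplified; NNI mutates within the declared search space rather than a hard-coded list):

.. code-block:: python

    import copy
    import random

    population = [{'lr': 0.01, 'score': 0.7}, {'lr': 0.1, 'score': 0.5}]
    parent = max(population, key=lambda ind: ind['score'])  # keep a better individual
    child = copy.deepcopy(parent)
    child['lr'] = random.choice([0.001, 0.01, 0.1])         # mutate one hyperparameter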
""" """
def __init__(self, optimize_mode="maximize", population_size=32): def __init__(self, optimize_mode='maximize', population_size=32):
"""
Parameters
----------
optimize_mode : str, default 'maximize'
population_size : int
initial population size. The larger population size,
the better evolution performance.
"""
self.optimize_mode = OptimizeMode(optimize_mode)
self.population_size = population_size
@@ -89,11 +99,11 @@ class EvolutionTuner(Tuner):
def update_search_space(self, search_space):
"""
Update search space.
Search_space contains the information that the user pre-defined.
Parameters
----------
search_space : dict
"""
self.searchspace_json = search_space
@@ -109,8 +119,10 @@ class EvolutionTuner(Tuner):
"""
To deal with trial failure. If a trial fails,
randomly generate new parameters and add them into the population.
Parameters
----------
parameter_id : int
Unique identifier for hyper-parameters used by this trial.
success : bool
@@ -136,12 +148,15 @@ class EvolutionTuner(Tuner):
def generate_multiple_parameters(self, parameter_id_list, **kwargs):
"""
Returns multiple sets of trial (hyper-)parameters, as iterable of serializable objects.
Parameters
----------
parameter_id_list : list of int
Unique identifiers for each set of requested hyper-parameters.
**kwargs
Not used
Returns
-------
list
@@ -182,12 +197,13 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
A group of candidate parameters that evolution tuner generated.
"""
pos = -1
@@ -234,10 +250,12 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
One newly generated configuration.
"""
@@ -258,6 +276,7 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
...
@@ -41,29 +41,77 @@ class GPClassArgsValidator(ClassArgsValidator):
class GPTuner(Tuner):
"""
GPTuner is a Bayesian Optimization method where Gaussian Process
is used for modeling loss functions.
Bayesian optimization works by constructing a posterior distribution of functions
(a Gaussian Process) that best describes the function you want to optimize.
As the number of observations grows, the posterior distribution improves,
and the algorithm becomes more certain of which regions in parameter space
are worth exploring and which are not.
GPTuner is designed to minimize/maximize the number of steps required to find
a combination of parameters that are close to the optimal combination.
To do so, this method uses a proxy optimization problem (finding the maximum of
the acquisition function) that, albeit still a hard problem, is cheaper
(in the computational sense) to solve, and it's amenable to common tools.
Therefore, Bayesian Optimization is suggested for situations where sampling the function
to be optimized is very expensive.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``randint``, ``uniform``, ``quniform``, ``loguniform``, ``qloguniform``, and numerical ``choice``.
This optimization approach is described in Section 3 of the paper
`Algorithms for Hyper-Parameter Optimization <https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf>`__
( :footcite:t:`bergstra2011algorithms` ).
Examples
--------
.. code-block::

    config.tuner.name = 'GPTuner'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'utility': 'ei',
        'kappa': 5.0,
        'xi': 0.0,
        'nu': 2.5,
        'alpha': 1e-6,
        'cold_start_num': 10,
        'selection_num_warm_up': 100000,
        'selection_num_starting_points': 250
    }
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'.
utility : str
Utility function (also called 'acquisition function') to use,
which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
kappa : float
Value used by utility function 'ucb'. The bigger kappa is,
the more the tuner will be exploratory. By default 5.
xi : float
Used by utility function 'ei' and 'poi'. The bigger xi is,
the more the tuner will be exploratory. By default 0.
nu : float
Used to specify the Matern kernel. The smaller nu,
the less smooth the approximated function is. By default 2.5.
alpha : float
Used to specify the Gaussian Process Regressor.
Larger values correspond to an increased noise level in the observations.
By default 1e-6.
cold_start_num : int
Number of random explorations to perform before Gaussian Process.
By default 10.
selection_num_warm_up : int
Number of random points to evaluate for getting the point which
maximizes the acquisition function. By default 100000.
selection_num_starting_points : int
Number of times to run L-BFGS-B from a random starting point after the warmup.
By default 250.
""" """
def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10, def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10,
......
@@ -2,14 +2,10 @@
# Licensed under the MIT license.
"""
Grid search tuner.
For categorical parameters this tuner fully explores all combinations.
For numerical parameters it samples them at progressively decreased intervals.
"""
__all__ = ['GridSearchTuner']
@@ -63,6 +59,35 @@ _logger = logging.getLogger('nni.tuner.gridsearch')
##
class GridSearchTuner(Tuner):
"""
Grid search tuner divides search space into evenly spaced grid, and performs brute-force traverse.
Recommended when the search space is small, or if you want to find strictly optimal hyperparameters.
**Implementation**
The original grid search approach performs an exhaustive search through a space consists of ``choice`` and ``randint``.
NNI's implementation extends grid search to support all search spaces types.
When the search space contains continuous parameters like ``normal`` and ``loguniform``,
grid search tuner works in following steps:
1. Divide the search space into a grid.
2. Perform an exhaustive searth through the grid.
3. Subdivide the grid into a finer-grained new grid.
4. Goto step 2, until experiment end.
As a deterministic algorithm, grid search has no argument.
Examples
--------

.. code-block::

    config.tuner.name = 'GridSearch'
"""
def __init__(self):
self.space = None
...
@@ -105,7 +105,8 @@ def json2parameter(ss_spec, random_state):
class Bracket():
"""
A bracket in Hyperband; all the information of a bracket is managed by an instance of this class.
Parameters
----------
@@ -267,24 +268,136 @@ class HyperbandClassArgsValidator(ClassArgsValidator):
class Hyperband(MsgDispatcherBase):
"""
`Hyperband <https://arxiv.org/pdf/1603.06560.pdf>`__ is a multi-fidelity hyperparameter tuning algorithm
based on successive halving.
The basic idea of Hyperband is to create several buckets,
each having ``n`` randomly generated hyperparameter configurations,
each configuration using ``r`` resources (e.g., epoch number, batch number).
After the ``n`` configurations are finished, it chooses the top ``n/eta`` configurations
and runs them using increased ``r*eta`` resources.
At last, it chooses the best configuration it has found so far.
Please refer to the paper :footcite:t:`li2017hyperband` for detailed algorithm.
Examples
--------
.. code-block::

    config.advisor.name = 'Hyperband'
    config.advisor.class_args = {
        'optimize_mode': 'maximize',
        'R': 60,
        'eta': 3
    }
Note that once you use Advisor, you are not allowed to add a Tuner and Assessor spec in the config file.
When Hyperband is used, the dict returned by :func:`nni.get_next_parameter` has one more key
called ``TRIAL_BUDGET`` besides the hyperparameters and their values.
**With this TRIAL_BUDGET, users can control in trial code how long a trial runs by following
the suggested trial budget from Hyperband.** ``TRIAL_BUDGET`` is a relative number,
users can interpret them as number of epochs, number of mini-batches, running time, etc.
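A sketch of trial code honoring the budget (interpreting ``TRIAL_BUDGET`` as epochs is the user's choice; ``train_one_epoch`` and ``evaluate`` are hypothetical user functions):

.. code-block:: python

    import nni

    params = nni.get_next_parameter()
    budget = int(params['TRIAL_BUDGET'])  # relative budget suggested by Hyperband
    for epoch in range(budget):
        train_one_epoch(params)           # hypothetical user training step
    nni.report_final_result(evaluate())   # hypothetical user evaluation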
Here is a concrete example of ``R=81`` and ``eta=3``:
.. list-table::
:header-rows: 1
:widths: auto
* -
- s=4
- s=3
- s=2
- s=1
- s=0
* - i
- n r
- n r
- n r
- n r
- n r
* - 0
- 81 1
- 27 3
- 9 9
- 6 27
- 5 81
* - 1
- 27 3
- 9 9
- 3 27
- 2 81
-
* - 2
- 9 9
- 3 27
- 1 81
-
-
* - 3
- 3 27
- 1 81
-
-
-
* - 4
- 1 81
-
-
-
-
``s`` means bucket, ``n`` means the number of configurations that are generated,
the corresponding ``r`` means how many budgets these configurations run.
``i`` means round, for example, bucket 4 has 5 rounds, bucket 3 has 4 rounds.
A complete example can be found at :githublink:`examples/trials/mnist-advisor`.
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
R: int
The maximum amount of budget that can be allocated to a single configuration.
Here, trial budget could mean the number of epochs, number of mini-batches, etc.,
depending on how users interpret it.
Each trial should use ``TRIAL_BUDGET`` to control how long it runs.
eta: int
The variable that controls the proportion of configurations discarded in each round of SuccessiveHalving.
``1/eta`` configurations will survive and rerun using more budgets in each round.
exec_mode: str
Execution mode, 'serial' or 'parallelism'.
If 'parallelism', the tuner will try to use available resources to start new buckets immediately.
If 'serial', the tuner will only start a new bucket after the current bucket is done.
Notes
-----
First, Hyperband is an example of how to write an autoML algorithm based on MsgDispatcherBase,
rather than based on Tuner and Assessor. Hyperband is implemented in this way
because it integrates the functions of both Tuner and Assessor; thus, we call it Advisor.
Second, this implementation fully leverages Hyperband's internal parallelism.
Specifically, the next bucket is not started strictly after the current bucket.
Instead, it starts when there are available resources. If you want to use full parallelism mode,
set ``exec_mode`` to ``parallelism``.
Alternatively, you can set ``exec_mode`` to ``serial`` to follow the original algorithm.
In this mode, the next bucket will start strictly after the current bucket.
``parallelism`` mode may lead to multiple unfinished buckets,
in contrast, there is at most one unfinished bucket under ``serial`` mode.
The advantage of ``parallelism`` mode is to make full use of resources,
which may reduce the experiment duration multiple times.
""" """
def __init__(self, R=60, eta=3, optimize_mode='maximize', exec_mode='parallelism'): def __init__(self, optimize_mode='maximize', R=60, eta=3, exec_mode='parallelism'):
"""B = (s_max + 1)R""" """B = (s_max + 1)R"""
super(Hyperband, self).__init__() super(Hyperband, self).__init__()
self.R = R self.R = R
......
...@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator): ...@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator):
class HyperoptTuner(Tuner): class HyperoptTuner(Tuner):
""" """
HyperoptTuner is a tuner which using hyperopt algorithm. NNI wraps `hyperopt <https://github.com/hyperopt/hyperopt>`__ to provide anneal tuner.
This simple annealing algorithm begins by sampling from the prior
but tends over time to sample from points closer and closer to the best ones observed.
This algorithm is a simple variation of random search that leverages smoothness in the response surface.
The annealing rate is not adaptive.
Examples
--------
.. code-block::

    config.tuner.name = 'Anneal'
    config.tuner.class_args = {
        'optimize_mode': 'minimize'
    }
Parameters
----------
optimize_mode: 'minimize' or 'maximize'
Whether to minimize or maximize the trial result.
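A toy sketch of the annealing idea on a single uniform parameter (illustrative only; hyperopt's actual anneal sampler is more involved):

.. code-block:: python

    import random

    def anneal_sample(best: float, t: int, low: float = 0.0, high: float = 1.0) -> float:
        # Over time (growing t), sample in a shrinking neighborhood of the best point.
        width = (high - low) / (1 + t)
        return min(high, max(low, random.uniform(best - width, best + width)))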
""" """
def __init__(self, algorithm_name, optimize_mode='minimize', def __init__(self, algorithm_name, optimize_mode='minimize',
parallel_optimize=False, constant_liar_type='min'): parallel_optimize=False, constant_liar_type='min'):
"""
Parameters
----------
algorithm_name : str
algorithm_name includes "tpe", "random_search" and anneal".
optimize_mode : str
parallel_optimize : bool
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
constant_liar_type : str
constant_liar_type including "min", "max" and "mean"
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
"""
self.algorithm_name = algorithm_name
self.optimize_mode = OptimizeMode(optimize_mode)
self.json = None
@@ -238,15 +246,6 @@ class HyperoptTuner(Tuner):
raise RuntimeError('Not support tuner algorithm in hyperopt.')
def update_search_space(self, search_space):
"""
Update search space definition in tuner by search_space in parameters.
Will called when first setup experiemnt or update search space in WebUI.
Parameters
----------
search_space : dict
"""
validate_search_space(search_space)
self.json = search_space
@@ -266,22 +265,11 @@ class HyperoptTuner(Tuner):
self.rval.catch_eval_exceptions = False
def generate_parameters(self, parameter_id, **kwargs):
""" total_params = self._get_suggestion(random_search=False)
Returns a set of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
params : dict
"""
total_params = self.get_suggestion(random_search=False)
# avoid generating same parameter with concurrent trials because hyperopt doesn't support parallel mode
if total_params in self.total_data.values():
# but it can cause duplicate parameter rarely
total_params = self._get_suggestion(random_search=True)
self.total_data[parameter_id] = total_params
if self.parallel:
@@ -291,17 +279,6 @@ class HyperoptTuner(Tuner):
return params
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
"""
Record an observation of the objective function
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
if value is dict, it should have "default" key.
value is final metrics of the trial.
"""
reward = extract_scalar_reward(value)
# restore the parameters containing '_index'
if parameter_id not in self.total_data:
@@ -369,7 +346,7 @@ class HyperoptTuner(Tuner):
idxs[key] = [new_id]
vals[key] = [vals[key]]
self._miscs_update_idxs_vals(rval_miscs,
idxs,
vals,
idxs_map={new_id: new_id},
@@ -382,7 +359,7 @@ class HyperoptTuner(Tuner):
trials.insert_trial_docs([trial])
trials.refresh()
def _miscs_update_idxs_vals(self,
miscs,
idxs,
vals,
@@ -416,7 +393,7 @@ class HyperoptTuner(Tuner):
misc_by_id[tid]['idxs'][key] = [tid]
misc_by_id[tid]['vals'][key] = [val]
def _get_suggestion(self, random_search=False):
"""
Get a suggestion from hyperopt.
@@ -469,14 +446,6 @@ class HyperoptTuner(Tuner):
return total_params
def import_data(self, data):
"""
Import additional data for tuning
Parameters
----------
data:
a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
"""
_completed_num = 0
for trial_info in data:
logger.info("Importing data, current processing progress %s / %s", _completed_num, len(data))
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations
import logging
from schema import Schema, Optional
from nni import ClassArgsValidator
from nni.assessor import Assessor, AssessResult
from nni.typehint import Literal
from nni.utils import extract_scalar_history
logger = logging.getLogger('medianstop_Assessor')
@@ -18,18 +21,35 @@ class MedianstopClassArgsValidator(ClassArgsValidator):
}).validate(kwargs)
class MedianstopAssessor(Assessor):
"""
The median stopping rule stops a pending trial X at step S
if the trial’s best objective value by step S is strictly worse than the median value
of the running averages of all completed trials’ objectives reported up to step S.
Paper: `Google Vizier: A Service for Black-Box Optimization
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf>`__
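The rule itself is compact enough to sketch directly (a toy version; the real assessor tracks running averages per intermediate step):

.. code-block:: python

    import statistics

    def should_stop(best_by_step_s: float, completed_running_avgs: list) -> bool:
        # Stop if the trial's best value so far is strictly worse than the median
        # of completed trials' running averages at the same step (maximize mode).
        return best_by_step_s < statistics.median(completed_running_avgs)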
Examples
--------
.. code-block::

    config.assessor.name = 'Medianstop'
    config.assessor.class_args = {
        'optimize_mode': 'maximize',
        'start_step': 5
    }
Parameters
----------
optimize_mode
Whether to minimize or maximize the trial result.
start_step
A trial is determined to be stopped or not
only after receiving start_step number of reported intermediate results.
"""
def __init__(self, optimize_mode='maximize', start_step=0):
def __init__(self, optimize_mode: Literal['minimize', 'maximize'] = 'maximize', start_step: int = 0):
self._start_step = start_step self._start_step = start_step
self._running_history = dict() self._running_history = dict()
self._completed_avg_history = dict() self._completed_avg_history = dict()
...@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor): ...@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor):
self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):]) self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):])
def trial_end(self, trial_job_id, success): def trial_end(self, trial_job_id, success):
"""trial_end
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if successfully finished the experiment, False otherwise
"""
if trial_job_id in self._running_history: if trial_job_id in self._running_history:
if success: if success:
cnt = 0 cnt = 0
...@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor): ...@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor):
logger.warning('trial_end: trial_job_id does not exist in running_history') logger.warning('trial_end: trial_job_id does not exist in running_history')
def assess_trial(self, trial_job_id, trial_history): def assess_trial(self, trial_job_id, trial_history):
"""assess_trial
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognized exception in medianstop_assessor
"""
curr_step = len(trial_history) curr_step = len(trial_history)
if curr_step < self._start_step: if curr_step < self._start_step:
return AssessResult.Good return AssessResult.Good
......
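For intuition, the median stopping rule described in the MedianstopAssessor docstring can be sketched standalone. This is illustrative only, not the assessor's actual code; ``completed_running_avgs`` stands for the running averages of all completed trials' objectives reported up to the current step:

.. code-block:: python

    import statistics

    from nni.assessor import AssessResult

    def median_stop(trial_best, completed_running_avgs, optimize_mode='maximize'):
        # Stop the pending trial if its best objective so far is strictly
        # worse than the median of completed trials' running averages.
        if not completed_running_avgs:
            return AssessResult.Good
        median = statistics.median(completed_running_avgs)
        if optimize_mode == 'maximize':
            worse = trial_best < median
        else:
            worse = trial_best > median
        return AssessResult.Bad if worse else AssessResult.Good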
...@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator): ...@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator):
class MetisTuner(Tuner): class MetisTuner(Tuner):
""" """
Metis Tuner `Metis tuner <https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/>`__ offers
several benefits over other tuning algorithms.
While most tools only predict the optimal configuration, Metis gives you two outputs,
a prediction for the optimal configuration and a suggestion for the next trial.
No more guesswork!
More algorithm information you could reference here: While most tools assume training datasets do not have noisy data,
https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/ Metis actually tells you if you need to resample a particular hyper-parameter.
Attributes While most tools have problems of being exploitation-heavy,
Metis' search strategy balances exploration, exploitation, and (optional) resampling.
Metis belongs to the class of sequential model-based optimization (SMBO) algorithms
and it is based on the Bayesian Optimization framework. To model the parameter-vs-performance space,
Metis uses both a Gaussian Process and GMM. Since each trial can impose a high time cost,
Metis heavily trades inference computations with naive trials.
At each iteration, Metis does two tasks (refer to :footcite:t:`li2018metis` for details):
1. It finds the global optimal point in the Gaussian Process space.
This point represents the optimal configuration.
2. It identifies the next hyper-parameter candidate.
This is achieved by inferring the potential information gain of
exploration, exploitation, and resampling.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``quniform``, ``uniform``, ``randint``, and numerical ``choice``.
Examples
--------
.. code-block::
config.tuner.name = 'MetisTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
---------- ----------
optimize_mode : str optimize_mode : str
optimize_mode is a string that includes two modes, "maximize" and "minimize" optimize_mode is a string that includes two modes, "maximize" and "minimize"
no_resampling : bool no_resampling : bool
True or False. True or False.
Should Metis consider re-sampling as part of the search strategy? Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free, If you are confident that the training dataset is noise-free,
then you do not need re-sampling. then you do not need re-sampling.
no_candidates : bool no_candidates : bool
True or False. True or False.
Should Metis suggest parameters for the next benchmark? Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks, If you do not plan to do more benchmarks,
Metis can skip this step. Metis can skip this step.
selection_num_starting_points : int selection_num_starting_points : int
How many times Metis should try to find the global optimum in the search space? How many times Metis should try to find the global optimum in the search space?
The higher the number, the longer it takes to output the solution. The higher the number, the longer it takes to output the solution.
cold_start_num : int cold_start_num : int
Metis needs some trial results for cold start. Metis needs some trial results for cold start.
When the number of trial results is less than When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for trials. cold_start_num, Metis will randomly sample hyper-parameters for trials.
exploration_probability: float exploration_probability: float
The probability of Metis to select parameter from exploration instead of exploitation. The probability of Metis to select parameter from exploration instead of exploitation.
""" """
def __init__( def __init__(
...@@ -89,43 +124,6 @@ class MetisTuner(Tuner): ...@@ -89,43 +124,6 @@ class MetisTuner(Tuner):
selection_num_starting_points=600, selection_num_starting_points=600,
cold_start_num=10, cold_start_num=10,
exploration_probability=0.9): exploration_probability=0.9):
"""
Parameters
----------
optimize_mode : str
optimize_mode is a string that including two mode "maximize" and "minimize"
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
How many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis need some trial result to get cold start.
when the number of trial result is less than
cold_start_num, Metis will randomly sample hyper-parameter for trial.
exploration_probability : float
The probability of Metis to select parameter from exploration instead of exploitation.
x_bounds : list
The constraints of parameters.
x_types : list
The type of parameters.
"""
self.samples_x = [] self.samples_x = []
self.samples_y = [] self.samples_y = []
self.samples_y_aggregation = [] self.samples_y_aggregation = []
...@@ -141,7 +139,9 @@ class MetisTuner(Tuner): ...@@ -141,7 +139,9 @@ class MetisTuner(Tuner):
self.minimize_constraints_fun = None self.minimize_constraints_fun = None
self.minimize_starting_points = None self.minimize_starting_points = None
self.supplement_data_num = 0 self.supplement_data_num = 0
# The constraints of parameters
self.x_bounds = [] self.x_bounds = []
# The type of parameters
self.x_types = [] self.x_types = []
......
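The cold-start and exploration behavior documented above can be condensed into a small sketch (illustrative pseudologic under the docstring's defaults, not the tuner's actual implementation):

.. code-block:: python

    import random

    def choose_strategy(num_trial_results, cold_start_num=10,
                        exploration_probability=0.9):
        # Too few results: fall back to random sampling (cold start).
        if num_trial_results < cold_start_num:
            return 'random'
        # Otherwise prefer exploration with the configured probability.
        if random.random() < exploration_probability:
            return 'exploration'
        return 'exploitation'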
...@@ -170,26 +170,91 @@ class PBTClassArgsValidator(ClassArgsValidator): ...@@ -170,26 +170,91 @@ class PBTClassArgsValidator(ClassArgsValidator):
}).validate(kwargs) }).validate(kwargs)
class PBTTuner(Tuner): class PBTTuner(Tuner):
"""
Population Based Training (PBT) comes from `Population Based Training of Neural Networks <https://arxiv.org/abs/1711.09846v1>`__.
It's a simple asynchronous optimization algorithm which effectively utilizes a fixed computational budget to jointly optimize
a population of models and their hyperparameters to maximize performance.
Importantly, PBT discovers a schedule of hyperparameter settings rather than following the generally sub-optimal strategy of
trying to find a single fixed set to use for the whole course of training.
.. image:: ../../img/pbt.jpg
PBTTuner initializes a population with several trials (i.e., ``population_size``).
There are four steps in the above figure; each trial runs only one step at a time. The length of one step is controlled by the trial code,
e.g., one epoch. When a trial starts, it loads a checkpoint specified by PBTTuner and continues to run one step,
then saves checkpoint to a directory specified by PBTTuner and exits.
The trials in a population run steps synchronously, that is, after all the trials finish the ``i``-th step,
the ``(i+1)``-th step can be started. Exploitation and exploration of PBT are executed between two consecutive steps.
Two important steps to follow if you are trying to use PBTTuner:
1. **Provide checkpoint directory**. Since some trials need to load other trial's checkpoint,
users should provide a directory (i.e., ``all_checkpoint_dir``) which is accessible by every trial.
It is easy for local mode, users could directly use the default directory or specify any directory on the local machine.
For other training services, users should follow :doc:`the document of those training services <../experiment/training_service>`
to provide a directory in a shared storage, such as NFS, Azure storage.
2. **Modify your trial code**. Before running a step, a trial needs to load a checkpoint,
the checkpoint directory is specified in hyper-parameter configuration generated by PBTTuner,
i.e., ``params['load_checkpoint_dir']``. Similarly, the directory for saving checkpoint is also included in the configuration,
i.e., ``params['save_checkpoint_dir']``. Here, ``all_checkpoint_dir`` is base folder of ``load_checkpoint_dir``
and ``save_checkpoint_dir`` whose format is ``all_checkpoint_dir/<population-id>/<step>``.
.. code-block:: python
params = nni.get_next_parameter()
# the path of the checkpoint to load
load_path = os.path.join(params['load_checkpoint_dir'], 'model.pth')
# load checkpoint from `load_path`
...
# run one step
...
# the path for saving a checkpoint
save_path = os.path.join(params['save_checkpoint_dir'], 'model.pth')
# save checkpoint to `save_path`
...
The complete example code can be found :githublink:`here <examples/trials/mnist-pbt-tuner-pytorch>`.
Parameters
----------
optimize_mode : ``maximize`` or ``minimize``, default: ``maximize``
If ``maximize``, the tuner will target to maximize metrics. If ``minimize``, the tuner will target to minimize metrics.
all_checkpoint_dir : str
Directory for trials to load and save checkpoint.
If not specified, the directory would be ``~/nni/checkpoint/``.
Note that if the experiment is not local mode,
users should provide a path in a shared storage which can be accessed by all the trials.
population_size : int, default = 10
Number of trials in a population. Each step has this number of trials.
In our implementation, one step means running each trial for a specific number of training epochs set by users.
factor : float, default = 0.2
Factor for perturbation of hyperparameters.
resample_probability : float, default = 0.25
Probability for resampling.
fraction : float, default = 0.2
Fraction for selecting bottom and top trials.
Examples
--------
Below is an example of PBTTuner configuration in experiment config file.
.. code-block:: yaml
tuner:
name: PBTTuner
classArgs:
optimize_mode: maximize
all_checkpoint_dir: /the/path/to/store/checkpoints
population_size: 10
Notes
-----
Assessor is not allowed if PBTTuner is used.
"""
def __init__(self, optimize_mode="maximize", all_checkpoint_dir=None, population_size=10, factor=0.2, def __init__(self, optimize_mode="maximize", all_checkpoint_dir=None, population_size=10, factor=0.2,
resample_probability=0.25, fraction=0.2): resample_probability=0.25, fraction=0.2):
"""
Initialization
Parameters
----------
optimize_mode : str
maximize or minimize
all_checkpoint_dir : str
directory to store training model checkpoint
population_size : int
number of trials for each epoch
factor : float
factor for perturbation
resample_probability : float
probability for resampling
fraction : float
fraction for selecting bottom and top trials
"""
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
if all_checkpoint_dir is None: if all_checkpoint_dir is None:
all_checkpoint_dir = os.getenv('NNI_CHECKPOINT_DIRECTORY') all_checkpoint_dir = os.getenv('NNI_CHECKPOINT_DIRECTORY')
......
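Fleshing out the skeleton from the PBTTuner docstring, one step of a trial might look like the sketch below; the ``nn.Linear`` module and the hard-coded metric are stand-ins for a real model, training loop, and validation result:

.. code-block:: python

    import os

    import nni
    import torch
    import torch.nn as nn

    params = nni.get_next_parameter()
    model = nn.Linear(10, 2)  # stand-in for the real model

    # Resume from the checkpoint chosen by PBTTuner; the first step of a
    # population member has nothing to load yet.
    load_path = os.path.join(params['load_checkpoint_dir'], 'model.pth')
    if os.path.isfile(load_path):
        model.load_state_dict(torch.load(load_path))

    # ... run one step here (e.g. one epoch of training) ...
    nni.report_final_result(0.9)  # replace with the real validation metric

    # Save the checkpoint where PBTTuner expects it for the next step.
    os.makedirs(params['save_checkpoint_dir'], exist_ok=True)
    save_path = os.path.join(params['save_checkpoint_dir'], 'model.pth')
    torch.save(model.state_dict(), save_path)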
...@@ -306,40 +306,37 @@ class PPOClassArgsValidator(ClassArgsValidator): ...@@ -306,40 +306,37 @@ class PPOClassArgsValidator(ClassArgsValidator):
class PPOTuner(Tuner): class PPOTuner(Tuner):
""" """
PPOTuner, the implementation inherits the main logic of the implementation PPOTuner, the implementation inherits the main logic of the implementation
[ppo2 from openai](https://github.com/openai/baselines/tree/master/baselines/ppo2), and is adapted for NAS scenario. `ppo2 from openai <https://github.com/openai/baselines/tree/master/baselines/ppo2>`__ and is adapted for NAS scenario.
It uses ``lstm`` for its policy network and value network, policy and value share the same network. It uses ``lstm`` for its policy network and value network, policy and value share the same network.
Parameters
----------
optimize_mode : str
maximize or minimize
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
""" """
def __init__(self, optimize_mode, trials_per_update=20, epochs_per_update=4, minibatch_size=4, def __init__(self, optimize_mode, trials_per_update=20, epochs_per_update=4, minibatch_size=4,
ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, cliprange=0.2): ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, cliprange=0.2):
"""
Initialization, PPO model is not initialized here as search space is not received yet.
Parameters
----------
optimize_mode : str
maximize or minimize
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
"""
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.model_config = ModelConfig() self.model_config = ModelConfig()
self.model = None self.model = None
......
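Unlike the other tuners touched by this change, PPOTuner's docstring carries no Examples section. A minimal configuration sketch, assuming the builtin registration name ``PPOTuner`` (worth verifying against the installed NNI version; ``optimize_mode`` is the only required argument):

.. code-block::

    config.tuner.name = 'PPOTuner'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'trials_per_update': 20,
        'minibatch_size': 4,
    }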
...@@ -2,12 +2,14 @@ ...@@ -2,12 +2,14 @@
# Licensed under the MIT license. # Licensed under the MIT license.
""" """
Naive random tuner for hyper-parameter optimization. Naive random tuner.
You can specify an integer seed to make the result reproducible. You can specify an integer seed to make the result reproducible.
""" """
__all__ = ['RandomTuner', 'suggest', 'suggest_parameter'] from __future__ import annotations
__all__ = ['RandomTuner']
import logging import logging
...@@ -21,7 +23,26 @@ from nni.tuner import Tuner ...@@ -21,7 +23,26 @@ from nni.tuner import Tuner
_logger = logging.getLogger('nni.tuner.random') _logger = logging.getLogger('nni.tuner.random')
class RandomTuner(Tuner): class RandomTuner(Tuner):
def __init__(self, seed=None): """
A naive tuner that generates fully random hyperparameters.
Examples
--------
.. code-block::
config.tuner.name = 'Random'
config.tuner.class_args = {
'seed': 100
}
Parameters
----------
seed
The random seed.
"""
def __init__(self, seed: int | None = None):
self.space = None self.space = None
if seed is None: # explicitly generate a seed to make the experiment reproducible if seed is None: # explicitly generate a seed to make the experiment reproducible
seed = np.random.default_rng().integers(2 ** 31) seed = np.random.default_rng().integers(2 ** 31)
......
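The explicit seeding above is what makes a run reproducible: recording the generated seed lets an experiment be replayed exactly. The underlying pattern can be checked in isolation:

.. code-block:: python

    import numpy as np

    # Two generators seeded identically produce identical draws.
    rng_a = np.random.default_rng(100)
    rng_b = np.random.default_rng(100)
    assert rng_a.integers(2 ** 31) == rng_b.integers(2 ** 31)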
...@@ -38,20 +38,46 @@ class SMACClassArgsValidator(ClassArgsValidator): ...@@ -38,20 +38,46 @@ class SMACClassArgsValidator(ClassArgsValidator):
class SMACTuner(Tuner): class SMACTuner(Tuner):
""" """
This is a wrapper of [SMAC](https://github.com/automl/SMAC3) following NNI tuner interface. `SMAC <https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf>`__ is based on Sequential Model-Based Optimization (SMBO).
It only supports ``SMAC`` mode, and does not support the multiple instances of SMAC3 (i.e., It adapts the most prominent previously used model class (Gaussian stochastic process models)
the same configuration is run multiple times). and introduces the model class of random forests to SMBO in order to handle categorical parameters.
The SMAC supported by nni is a wrapper on `the SMAC3 github repo <https://github.com/automl/SMAC3>`__,
following NNI tuner interface :class:`nni.tuner.Tuner`. For algorithm details of SMAC, please refer to the paper
:footcite:t:`hutter2011sequential`.
Note that SMAC on nni only supports a subset of the types in
:doc:`search space </hpo/search_space>`:
``choice``, ``randint``, ``uniform``, ``loguniform``, and ``quniform``.
Note that SMAC needs additional installation using the following command:
.. code-block:: bash
pip install nni[SMAC]
``swig`` is required by SMAC. On Ubuntu, ``swig`` can be installed with ``apt``.
Examples
--------
.. code-block::
config.tuner.name = 'SMAC'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has already been generated. If True, the tuner will not generate a configuration that has already been generated.
If False, a configuration may be generated twice, but this is rare for a relatively large search space. If False, a configuration may be generated twice, but this is rare for a relatively large search space.
""" """
def __init__(self, optimize_mode="maximize", config_dedup=False): def __init__(self, optimize_mode="maximize", config_dedup=False):
"""
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has been already generated.
If False, a configuration may be generated twice, but it is rare for relatively large search space.
"""
self.logger = logger self.logger = logger
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.total_data = {} self.total_data = {}
......
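A configuration that also enables deduplication might look like the sketch below; whether ``config_dedup`` is worth enabling depends on how costly a repeated trial is relative to the size of the search space:

.. code-block::

    config.tuner.name = 'SMAC'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'config_dedup': True,  # never re-suggest an already-generated configuration
    }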
...@@ -2,26 +2,30 @@ ...@@ -2,26 +2,30 @@
# Licensed under the MIT license. # Licensed under the MIT license.
""" """
Tree-structured Parzen Estimator (TPE) tuner for hyper-parameter optimization. Tree-structured Parzen Estimator (TPE) tuner.
Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py
This is a slightly modified re-implementation of the algorithm. This is a slightly modified re-implementation of the algorithm.
""" """
__all__ = ['TpeTuner', 'TpeArguments', 'suggest', 'suggest_parameter'] from __future__ import annotations
__all__ = ['TpeTuner', 'TpeArguments']
from collections import defaultdict from collections import defaultdict
import logging import logging
import math import math
from typing import NamedTuple, Optional, Union from typing import Any, NamedTuple
import numpy as np import numpy as np
from scipy.special import erf # pylint: disable=no-name-in-module from scipy.special import erf # pylint: disable=no-name-in-module
from nni.tuner import Tuner
from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters
from nni.tuner import Tuner
from nni.typehint import Literal
from nni.utils import extract_scalar_reward from nni.utils import extract_scalar_reward
from . import random_tuner from . import random_tuner
...@@ -31,12 +35,13 @@ _logger = logging.getLogger('nni.tuner.tpe') ...@@ -31,12 +35,13 @@ _logger = logging.getLogger('nni.tuner.tpe')
class TpeArguments(NamedTuple): class TpeArguments(NamedTuple):
""" """
These are the hyper-parameters of TPE algorithm itself. Hyperparameters of TPE algorithm itself.
To avoid confusing with trials' hyper-parameters, they are called "arguments" in this code.
To avoid confusion with the trials' hyperparameters being tuned, these are called "arguments" here.
Parameters Parameters
========== ----------
constant_liar_type: 'best' | 'worst' | 'mean' | None (default: 'best') constant_liar_type
TPE algorithm itself does not support parallel tuning. TPE algorithm itself does not support parallel tuning.
This parameter specifies how to optimize for trial_concurrency > 1. This parameter specifies how to optimize for trial_concurrency > 1.
...@@ -44,20 +49,21 @@ class TpeArguments(NamedTuple): ...@@ -44,20 +49,21 @@ class TpeArguments(NamedTuple):
How each liar works is explained in paper's section 6.1. How each liar works is explained in paper's section 6.1.
In general "best" suit for small trial number and "worst" suit for large trial number. In general "best" suit for small trial number and "worst" suit for large trial number.
(:doc:`experiment result </misc/parallelizing_tpe_search>`)
n_startup_jobs: int (default: 20) n_startup_jobs
The first N hyper-parameters are generated fully randomly for warming up. The first N hyperparameters are generated fully randomly for warming up.
If the search space is large, you can increase this value. If the search space is large, you can increase this value.
Or if max_trial_number is small, you may want to decrease it. Or if max_trial_number is small, you may want to decrease it.
n_ei_candidates: int (default: 24) n_ei_candidates
For each iteration TPE samples EI for N sets of parameters and chooses the best one (loosely speaking). For each iteration TPE samples EI for N sets of parameters and chooses the best one (loosely speaking).
linear_forgetting: int (default: 25) linear_forgetting
TPE will lower the weights of old trials. TPE will lower the weights of old trials.
This controls how many iterations it takes for a trial to start decaying. This controls how many iterations it takes for a trial to start decaying.
prior_weight: float (default: 1.0) prior_weight
TPE treats user provided search space as prior. TPE treats user provided search space as prior.
When generating new trials, it also incorporates the prior in trial history by transforming the search space to When generating new trials, it also incorporates the prior in trial history by transforming the search space to
one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range). one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range).
...@@ -66,11 +72,11 @@ class TpeArguments(NamedTuple): ...@@ -66,11 +72,11 @@ class TpeArguments(NamedTuple):
With prior weight 1.0, the search space is treated as one good trial. With prior weight 1.0, the search space is treated as one good trial.
For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result. For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result.
gamma: float (default: 0.25) gamma
Controls how many trials are considered "good". Controls how many trials are considered "good".
The number is calculated as "min(gamma * sqrt(N), linear_forgetting)". The number is calculated as "min(gamma * sqrt(N), linear_forgetting)".
""" """
constant_liar_type: Optional[str] = 'best' constant_liar_type: Literal['best', 'worst', 'mean'] | None = 'best'
n_startup_jobs: int = 20 n_startup_jobs: int = 20
n_ei_candidates: int = 24 n_ei_candidates: int = 24
linear_forgetting: int = 25 linear_forgetting: int = 25
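As a quick check of the ``gamma`` formula quoted in the docstring above (a sketch; the exact rounding in the implementation may differ):

.. code-block:: python

    import math

    def num_good_trials(n_history, gamma=0.25, linear_forgetting=25):
        # min(gamma * sqrt(N), linear_forgetting): with the defaults, 100
        # observed trials give min(ceil(0.25 * 10), 25) = 3 "good" trials.
        return min(math.ceil(gamma * math.sqrt(n_history)), linear_forgetting)

    assert num_good_trials(100) == 3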
...@@ -79,18 +85,68 @@ class TpeArguments(NamedTuple): ...@@ -79,18 +85,68 @@ class TpeArguments(NamedTuple):
class TpeTuner(Tuner): class TpeTuner(Tuner):
""" """
Tree-structured Parzen Estimator (TPE) tuner.
TPE is a lightweight tuner that has no extra dependency and supports all search space types,
designed to be the default tuner.
It has the drawback that TPE cannot discover relationships between different hyperparameters.
**Implementation**
TPE is an SMBO algorithm.
It models P(x|y) and P(y) where x represents hyperparameters and y the evaluation result.
P(x|y) is modeled by transforming the generative process of hyperparameters,
replacing the distributions of the configuration prior with non-parametric densities.
Paper: `Algorithms for Hyper-Parameter Optimization
<https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf>`__
Examples
--------
.. code-block::
## minimal config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
.. code-block::
## advanced config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
'optimize_mode': 'maximize',
'seed': 12345,
'tpe_args': {
'constant_liar_type': 'mean',
'n_startup_jobs': 10,
'n_ei_candidates': 20,
'linear_forgetting': 100,
'prior_weight': 0,
'gamma': 0.5
}
}
Parameters Parameters
========== ----------
optimze_mode: 'minimize' | 'maximize' (default: 'minimize') optimize_mode: Literal['minimize', 'maximize']
Whether to minimize or maximize trial result. Whether to minimize or maximize trial result.
seed: int | None seed
The random seed. The random seed.
tpe_args: dict[string, Any] | None tpe_args
Advanced users can use this to customize TPE tuner. Advanced users can use this to customize TPE tuner.
See `TpeArguments` for details. See :class:`TpeArguments` for details.
""" """
def __init__(self, optimize_mode='minimize', seed=None, tpe_args=None): def __init__(self,
optimize_mode: Literal['minimize', 'maximize'] = 'minimize',
seed: int | None = None,
tpe_args: dict[str, Any] | None = None):
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.args = TpeArguments(**(tpe_args or {})) self.args = TpeArguments(**(tpe_args or {}))
self.space = None self.space = None
...@@ -183,7 +239,7 @@ def suggest_parameter(args, rng, spec, parameter_history): ...@@ -183,7 +239,7 @@ def suggest_parameter(args, rng, spec, parameter_history):
## Utilities part ## ## Utilities part ##
class Record(NamedTuple): class Record(NamedTuple):
param: Union[int, float] param: int | float
loss: float loss: float
class BestLiar: # assume running parameters have best result, it accelerates "converging" class BestLiar: # assume running parameters have best result, it accelerates "converging"
...@@ -305,7 +361,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma): ...@@ -305,7 +361,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
this function is used for everything other than "choice" and "randint". this function is used for everything other than "choice" and "randint".
Parameters Parameters
========== ----------
args: TpeArguments args: TpeArguments
Algorithm arguments. Algorithm arguments.
history_mus: 1-d array of float history_mus: 1-d array of float
...@@ -317,7 +373,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma): ...@@ -317,7 +373,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
σ value of normal search space. σ value of normal search space.
Returns Returns
======= -------
Tuple of three 1-d float arrays: (weight, µ, σ). Tuple of three 1-d float arrays: (weight, µ, σ).
The tuple represents N+1 "vicinity of observations" and each one's weight, The tuple represents N+1 "vicinity of observations" and each one's weight,
......
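The constant-liar options referenced in ``TpeArguments`` (and in the ``BestLiar`` class above) can be illustrated with a standalone sketch; this is not the module's actual implementation:

.. code-block:: python

    def lie_for_running_trials(finished_losses, n_running, liar_type='best'):
        # While trials are still running, append a fake ("liar") loss for
        # each, so concurrent suggestions do not cluster in one region.
        if liar_type == 'best':
            lie = min(finished_losses)   # optimistic placeholder
        elif liar_type == 'worst':
            lie = max(finished_losses)   # pessimistic placeholder
        else:                            # 'mean'
            lie = sum(finished_losses) / len(finished_losses)
        return finished_losses + [lie] * n_running

    print(lie_for_running_trials([0.3, 0.5], n_running=2))  # [0.3, 0.5, 0.3, 0.3]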
...@@ -5,6 +5,7 @@ import copy ...@@ -5,6 +5,7 @@ import copy
import functools import functools
import inspect import inspect
import numbers import numbers
import os
import sys import sys
import types import types
import warnings import warnings
...@@ -257,6 +258,13 @@ def trace(cls_or_func: T = None, *, kw_only: bool = True, inheritable: bool = Fa ...@@ -257,6 +258,13 @@ def trace(cls_or_func: T = None, *, kw_only: bool = True, inheritable: bool = Fa
pass pass
""" """
# This is an internal flag to control the behavior of trace.
# Useful in doc build and tests.
# Might be changed in future.
nni_trace_flag = os.environ.get('NNI_TRACE_FLAG', '')
if nni_trace_flag.lower() == 'disable':
return cls_or_func
def wrap(cls_or_func): def wrap(cls_or_func):
# already annotated, do nothing # already annotated, do nothing
if is_wrapped_with_trace(cls_or_func): if is_wrapped_with_trace(cls_or_func):
......
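The new flag can be exercised as below, assuming (as the hunk shows) that the variable is read when ``trace`` is applied, so it must be set before any decoration runs:

.. code-block:: python

    import os

    # Set before any @nni.trace decoration runs, e.g. in a doc build or test.
    os.environ['NNI_TRACE_FLAG'] = 'disable'

    import nni

    @nni.trace
    def make_criterion():
        return lambda output, target: abs(output - target)

    # With the flag set to 'disable', trace() returns the callable unchanged.
    criterion = make_criterion()
    print(criterion(1.0, 0.5))  # 0.5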