OpenDAS / nni · Commits

Commit a911b856 (unverified)
Resolve conflicts for #4760 (#4762)

Authored Apr 21, 2022 by Yuge Zhang; committed by GitHub on Apr 21, 2022.
Parent: 14d2966b
Changes: showing 20 of the commit's 688 changed files, with 655 additions and 304 deletions (+655 −304).
Files in this page:

nni/algorithms/compression/v2/pytorch/utils/__init__.py            +3    −0
nni/algorithms/compression/v2/pytorch/utils/constructor_helper.py  +6    −6
nni/algorithms/compression/v2/pytorch/utils/pruning.py             +12   −1
nni/algorithms/feature_engineering/gbdt_selector/__init__.py       +3    −0
nni/algorithms/feature_engineering/gradient_selector/__init__.py   +3    −0
nni/algorithms/hpo/batch_tuner.py                                  +48   −35
nni/algorithms/hpo/bohb_advisor/__init__.py                        +3    −0
nni/algorithms/hpo/bohb_advisor/bohb_advisor.py                    +131  −18
nni/algorithms/hpo/curvefitting_assessor/curvefitting_assessor.py  +38   −32
nni/algorithms/hpo/dngo_tuner.py                                   +17   −1
nni/algorithms/hpo/evolution_tuner.py                              +45   −26
nni/algorithms/hpo/gp_tuner/__init__.py                            +3    −0
nni/algorithms/hpo/gp_tuner/gp_tuner.py                            +58   −10
nni/algorithms/hpo/gridsearch_tuner.py                             +36   −6
nni/algorithms/hpo/hyperband_advisor.py                            +124  −11
nni/algorithms/hpo/hyperopt_tuner.py                               +26   −57
nni/algorithms/hpo/medianstop_assessor.py                          +26   −34
nni/algorithms/hpo/metis_tuner/__init__.py                         +3    −0
nni/algorithms/hpo/metis_tuner/metis_tuner.py                      +67   −67
nni/algorithms/hpo/networkmorphism_tuner/__init__.py               +3    −0
nni/algorithms/compression/v2/pytorch/utils/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .config_validation import CompressorSchema
 from .pruning import (
     config_list_canonical,
...
nni/algorithms/compression/v2/pytorch/utils/constructor_helper.py  (view file @ a911b856)

...
@@ -10,7 +10,7 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from nni.common.serializer import _trace_cls
-from nni.common.serializer import Traceable
+from nni.common.serializer import Traceable, is_traceable

 __all__ = ['OptimizerConstructHelper', 'LRSchedulerConstructHelper']
...
@@ -80,14 +80,14 @@ class OptimizerConstructHelper(ConstructHelper):
     @staticmethod
     def from_trace(model: Module, optimizer_trace: Traceable):
-        assert isinstance(optimizer_trace, Traceable), \
+        assert is_traceable(optimizer_trace), \
             'Please use nni.trace to wrap the optimizer class before initialize the optimizer.'
         assert isinstance(optimizer_trace, Optimizer), \
             'It is not an instance of torch.nn.Optimizer.'
         return OptimizerConstructHelper(model,
-                                        optimizer_trace._get_nni_attr('symbol'),
-                                        *optimizer_trace._get_nni_attr('args'),
-                                        **optimizer_trace._get_nni_attr('kwargs'))
+                                        optimizer_trace.trace_symbol,
+                                        *optimizer_trace.trace_args,
+                                        **optimizer_trace.trace_kwargs)

 class LRSchedulerConstructHelper(ConstructHelper):
...
@@ -112,7 +112,7 @@ class LRSchedulerConstructHelper(ConstructHelper):
     @staticmethod
     def from_trace(lr_scheduler_trace: Traceable):
-        assert isinstance(lr_scheduler_trace, Traceable), \
+        assert is_traceable(lr_scheduler_trace), \
             'Please use nni.trace to wrap the lr scheduler class before initialize the scheduler.'
         assert isinstance(lr_scheduler_trace, _LRScheduler), \
             'It is not an instance of torch.nn.lr_scheduler._LRScheduler.'
...
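The assertion messages above state the calling convention without showing it. For orientation only (this is not part of the commit), here is a minimal sketch of what a traced optimizer looks like before it is handed to OptimizerConstructHelper.from_trace; the toy model and learning rate are illustrative.

    # Sketch only: wrap the optimizer class with nni.trace before instantiating it,
    # so the resulting object passes the new is_traceable() check and carries
    # trace_symbol / trace_args / trace_kwargs.
    import nni
    import torch
    from torch import nn

    model = nn.Linear(4, 2)                                      # toy model
    optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01)
    # OptimizerConstructHelper.from_trace(model, optimizer)      # as in the diff above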
nni/algorithms/compression/v2/pytorch/utils/pruning.py  (view file @ a911b856)

...
@@ -198,7 +198,18 @@ def compute_sparsity(origin_model: Module, compact_model: Module, compact_model_
     The current state means `compact_model` + `compact_model_masks`
     (i.e., `compact_model_masks` applied on `compact_model`).
     The compact model is the origin model after pruning,
-    and it may have different structure with origin_model cause of speed up.
+    and it may have different structure with origin_model cause of speedup.
+
+    Parameters
+    ----------
+    origin_model : torch.nn.Module
+        The original un-pruned model.
+    compact_model : torch.nn.Module
+        The model after speedup or original model.
+    compact_model_masks: Dict[str, Dict[str, Tensor]]
+        The masks applied on the compact model, if the original model have been speedup, this should be {}.
+    config_list : List[Dict]
+        The config_list used by pruning the original model.

     Returns
     -------
...
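The docstring added above talks about "masks applied on the compact model". As orientation only, here is a standalone sketch of what mask-induced sparsity means per layer; it is not the commit's compute_sparsity implementation, and the layer names and mask shapes are invented.

    # Sketch: sparsity of a layer under a 0/1 weight mask = fraction of zeroed weights.
    import torch
    from torch import nn

    def masked_sparsity(model: nn.Module, masks: dict) -> dict:
        """masks maps layer name -> {'weight': 0/1 tensor shaped like the weight}."""
        result = {}
        for name, module in model.named_modules():
            if name in masks and hasattr(module, 'weight'):
                mask = masks[name]['weight']
                result[name] = 1.0 - mask.count_nonzero().item() / mask.numel()
        return result

    model = nn.Sequential(nn.Linear(8, 4))
    masks = {'0': {'weight': (torch.rand(4, 8) > 0.5).float()}}
    print(masked_sparsity(model, masks))   # e.g. {'0': ~0.5}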
nni/algorithms/feature_engineering/gbdt_selector/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .gbdt_selector import GBDTSelector
 \ No newline at end of file
nni/algorithms/feature_engineering/gradient_selector/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .gradient_selector import FeatureGradientSelector
 \ No newline at end of file
nni/algorithms/hpo/batch_tuner.py  (view file @ a911b856)

...
@@ -20,27 +20,64 @@ LOGGER = logging.getLogger('batch_tuner_AutoML')
 class BatchTuner(Tuner):
     """
-    BatchTuner is tuner will running all the configure that user want to run batchly.
+    Batch tuner is a special tuner that allows users to simply provide several hyperparameter sets,
+    and it will evaluate each set.
+
+    Batch tuner does **not** support standard search space.
+
+    Search space of batch tuner looks like a single ``choice`` in standard search space,
+    but it has different meaning.
+
+    Consider following search space:
+
+    .. code-block::
+
+        'combine_params': {
+            '_type': 'choice',
+            '_value': [
+                {'x': 0, 'y': 1},
+                {'x': 1, 'y': 2},
+                {'x': 1, 'y': 3},
+            ]
+        }
+
+    Batch tuner will generate following 4 hyperparameter sets:
+
+    1. {'x': 0, 'y': 1}
+    2. {'x': 1, 'y': 2}
+    3. {'x': 1, 'y': 3}
+
+    If this search space was used with grid search tuner, it would instead generate:
+
+    1. {'combine_params': {'x': 0, 'y': 1 }}
+    2. {'combine_params': {'x': 1, 'y': 2 }}
+    3. {'combine_params': {'x': 1, 'y': 3 }}

     Examples
     --------
-    The search space only be accepted like:
-
-    ::
-
-        {'combine_params':
-            { '_type': 'choice',
-              '_value': '[{...}, {...}, {...}]',
-            }
+    .. code-block::
+
+        config.search_space = {
+            'combine_params': {
+                '_type': 'choice',
+                '_value': [
+                    {'optimizer': 'Adam', 'learning_rate': 0.001},
+                    {'optimizer': 'Adam', 'learning_rate': 0.0001},
+                    {'optimizer': 'Adam', 'learning_rate': 0.00001},
+                    {'optimizer': 'SGD', 'learning_rate': 0.01},
+                    {'optimizer': 'SGD', 'learning_rate': 0.005},
+                ]
+            }
+        }
+        config.tuner.name = 'BatchTuner'
     """

     def __init__(self):
         self._count = -1
         self._values = []

-    def is_valid(self, search_space):
+    def _is_valid(self, search_space):
         """
         Check the search space is valid: only contains 'choice' type
...
@@ -70,27 +107,10 @@ class BatchTuner(Tuner):
         return None

     def update_search_space(self, search_space):
-        """Update the search space
-
-        Parameters
-        ----------
-        search_space : dict
-        """
         validate_search_space(search_space, ['choice'])
-        self._values = self.is_valid(search_space)
+        self._values = self._is_valid(search_space)

     def generate_parameters(self, parameter_id, **kwargs):
-        """Returns a dict of trial (hyper-)parameters, as a serializable object.
-
-        Parameters
-        ----------
-        parameter_id : int
-
-        Returns
-        -------
-        dict
-            A candidate parameter group.
-        """
         self._count += 1
         if self._count > len(self._values) - 1:
             raise nni.NoMoreTrialError('no more parameters now.')
...
@@ -100,13 +120,6 @@ class BatchTuner(Tuner):
         pass

     def import_data(self, data):
-        """Import additional data for tuning
-
-        Parameters
-        ----------
-        data:
-            a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
-        """
         if not self._values:
             LOGGER.info("Search space has not been initialized, skip this data import")
             return
...
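For readers skimming the new docstring, the described behaviour boils down to handing out each entry of the single ``choice`` list once and stopping when the list is exhausted. The toy walk-through below (not the tuner's code) restates that; the values are taken from the docstring above.

    # Each item of '_value' becomes one trial, in order.
    search_space = {
        'combine_params': {
            '_type': 'choice',
            '_value': [
                {'x': 0, 'y': 1},
                {'x': 1, 'y': 2},
                {'x': 1, 'y': 3},
            ],
        }
    }

    values = search_space['combine_params']['_value']
    for parameter_id, params in enumerate(values):
        print(parameter_id, params)
    # a further request would raise nni.NoMoreTrialError in the real tuner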
nni/algorithms/hpo/bohb_advisor/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .bohb_advisor import BOHB, BOHBClassArgsValidator
nni/algorithms/hpo/bohb_advisor/bohb_advisor.py  (view file @ a911b856)

...
@@ -249,20 +249,52 @@ class BOHBClassArgsValidator(ClassArgsValidator):
 class BOHB(MsgDispatcherBase):
     """
-    BOHB performs robust and efficient hyperparameter optimization
-    at scale by combining the speed of Hyperband searches with the
-    guidance and guarantees of convergence of Bayesian Optimization.
-    Instead of sampling new configurations at random, BOHB uses
-    kernel density estimators to select promising candidates.
+    `BOHB <https://arxiv.org/abs/1807.01774>`__ is a robust and efficient hyperparameter tuning algorithm at scale.
+    BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".
+
+    BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget,
+    but it replaces the random selection of configurations at the beginning of each HB iteration
+    by a model-based search (Bayesian Optimization).
+    Once the desired number of configurations for the iteration is reached,
+    the standard successive halving procedure is carried out using these configurations.
+    It keeps track of the performance of all function evaluations g(x, b) of configurations x
+    on all budgets b to use as a basis for our models in later iterations.
+    Please refer to the paper :footcite:t:`falkner2018bohb` for detailed algorithm.
+
+    Note that BOHB needs additional installation using the following command:
+
+    .. code-block:: bash
+
+        pip install nni[BOHB]
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.advisor.name = 'BOHB'
+        config.advisor.class_args = {
+            'optimize_mode': 'maximize',
+            'min_budget': 1,
+            'max_budget': 27,
+            'eta': 3,
+            'min_points_in_model': 7,
+            'top_n_percent': 15,
+            'num_samples': 64,
+            'random_fraction': 0.33,
+            'bandwidth_factor': 3.0,
+            'min_bandwidth': 0.001
+        }

     Parameters
     ----------
     optimize_mode: str
-        optimize mode, 'maximize' or 'minimize'
+        Optimize mode, 'maximize' or 'minimize'.
     min_budget: float
-        The smallest budget to consider. Needs to be positive!
+        The smallest budget to assign to a trial job, (budget can be the number of mini-batches or epochs).
+        Needs to be positive.
     max_budget: float
-        The largest budget to consider. Needs to be larger than min_budget!
+        The largest budget to assign to a trial job. Needs to be larger than min_budget.
         The budgets will be geometrically distributed
         :math:`a^2 + b^2 = c^2 \\sim \\eta^k` for :math:`k \\in [0, 1, ... , num\\_subsets - 1]`.
     eta: int
...
@@ -271,21 +303,102 @@ class BOHB(MsgDispatcherBase):
         1/eta of them 'advances' to the next round.
         Must be greater or equal to 2.
     min_points_in_model: int
-        number of observations to start building a KDE. Default 'None' means
-        dim+1, the bare minimum.
+        Number of observations to start building a KDE. Default 'None' means dim+1;
+        when the number of completed trials in this budget is equal to or larger than ``max{dim+1, min_points_in_model}``,
+        BOHB will start to build a KDE model of this budget then use said KDE model to guide configuration selection.
+        Needs to be positive. (dim means the number of hyperparameters in search space)
     top_n_percent: int
-        percentage ( between 1 and 99, default 15) of the observations that are considered good.
+        Percentage (between 1 and 99, default 15) of the observations which are considered good.
+        Good points and bad points are used for building KDE models.
+        For example, if you have 100 observed trials and top_n_percent is 15,
+        then the top 15% of points will be used for building the good points models "l(x)".
+        The remaining 85% of points will be used for building the bad point models "g(x)".
     num_samples: int
-        number of samples to optimize EI (default 64)
+        Number of samples to optimize EI (default 64).
+        In this case, it will sample "num_samples" points and compare the result of l(x)/g(x).
+        Then it will return the one with the maximum l(x)/g(x) value as the next configuration
+        if the optimize_mode is ``maximize``. Otherwise, it returns the smallest one.
     random_fraction: float
-        fraction of purely random configurations that are sampled from the
-        prior without the model.
+        Fraction of purely random configurations that are sampled from the prior without the model.
     bandwidth_factor: float
-        to encourage diversity, the points proposed to optimize EI, are sampled
-        from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3)
+        To encourage diversity, the points proposed to optimize EI are sampled
+        from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3).
+        It is suggested to use the default value if you are not familiar with KDE.
     min_bandwidth: float
-        to keep diversity, even when all (good) samples have the same value for one of the parameters,
-        a minimum bandwidth (Default: 1e-3) is used instead of zero.
+        To keep diversity, even when all (good) samples have the same value for one of the parameters,
+        a minimum bandwidth (default: 1e-3) is used instead of zero.
+        It is suggested to use the default value if you are not familiar with KDE.
+    config_space: str
+        Directly use a .pcs file serialized by `ConfigSpace <https://automl.github.io/ConfigSpace/>` in "pcs new" format.
+        In this case, search space file (if provided in config) will be ignored.
+        Note that this path needs to be an absolute path. Relative path is currently not supported.
+
+    Notes
+    -----
+    Below is the introduction of the BOHB process separated in two parts:
+
+    **The first part HB (Hyperband).**
+
+    BOHB follows Hyperband's way of choosing the budgets and continue to use SuccessiveHalving.
+    For more details, you can refer to the :class:`nni.algorithms.hpo.hyperband_advisor.Hyperband`
+    and the `reference paper for Hyperband <https://arxiv.org/abs/1603.06560>`__.
+    This procedure is summarized by the pseudocode below.
+
+    .. image:: ../../img/bohb_1.png
+        :scale: 80 %
+        :align: center
+
+    **The second part BO (Bayesian Optimization)**
+
+    The BO part of BOHB closely resembles TPE with one major difference:
+    It opted for a single multidimensional KDE compared to the hierarchy of one-dimensional KDEs used in TPE
+    in order to better handle interaction effects in the input space.
+    Tree Parzen Estimator (TPE): uses a KDE (kernel density estimator) to model the densities.
+
+    .. image:: ../../img/bohb_2.png
+        :scale: 80 %
+        :align: center
+
+    To fit useful KDEs, we require a minimum number of data points Nmin;
+    this is set to d + 1 for our experiments, where d is the number of hyperparameters.
+    To build a model as early as possible, we do not wait until Nb = \|Db\|,
+    where the number of observations for budget b is large enough to satisfy q · Nb ≥ Nmin.
+    Instead, after initializing with Nmin + 2 random configurations, we choose the
+    best and worst configurations, respectively, to model the two densities.
+    Note that it also samples a constant fraction named **random fraction** of the configurations uniformly at random.
+
+    .. image:: ../../img/bohb_3.png
+        :scale: 80 %
+        :align: center
+
+    .. image:: ../../img/bohb_6.jpg
+        :scale: 65 %
+        :align: center
+
+    **The above image shows the workflow of BOHB.**
+
+    Here set max_budget = 9, min_budget = 1, eta = 3, others as default.
+    In this case, s_max = 2, so we will continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle.
+    In each stage of SuccessiveHalving (the orange box), it will pick the top 1/eta configurations and run them again with more budget,
+    repeating the SuccessiveHalving stage until the end of this iteration.
+    At the same time, it collects the configurations, budgets and final metrics of each trial
+    and use these to build a multidimensional KDE model with the key "budget".
+    Multidimensional KDE is used to guide the selection of configurations for the next iteration.
+    The sampling procedure (using Multidimensional KDE to guide selection) is summarized by the pseudocode below.
+
+    .. image:: ../../img/bohb_4.png
+        :scale: 80 %
+        :align: center
+
+    **Here is a simple experiment which tunes MNIST with BOHB.**
+
+    Code implementation: :githublink:`examples/trials/mnist-advisor <examples/trials/mnist-advisor>`
+
+    The following is the experimental final results:
+
+    .. image:: ../../img/bohb_5.png
+        :scale: 80 %
+        :align: center
+
+    More experimental results can be found in the `reference paper <https://arxiv.org/abs/1807.01774>`__.
+    It shows that BOHB makes good use of previous results and has a balanced trade-off in exploration and exploitation.
     """

     def __init__(self,
...
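The parameter descriptions above (top_n_percent, num_samples) refer to the l(x)/g(x) selection without showing it. Below is a minimal, self-contained sketch of that idea using a one-dimensional SciPy KDE; it is not the advisor's implementation (BOHB builds a multidimensional KDE per budget and widens its bandwidth), and the toy objective, sizes, and seeds are invented.

    # Sketch: split observations into "good" and "bad" by top_n_percent, fit a KDE
    # to each, then pick the sampled candidate with the largest l(x)/g(x) ratio.
    import numpy as np
    from scipy.stats import gaussian_kde

    rng = np.random.default_rng(0)
    observed_x = rng.uniform(0, 1, 100)             # 100 observed configurations (1 hyperparameter)
    observed_y = (observed_x - 0.3) ** 2            # toy objective, lower is better here

    top_n_percent = 15
    order = np.argsort(observed_y)
    n_good = len(observed_x) * top_n_percent // 100
    good_kde = gaussian_kde(observed_x[order[:n_good]])    # "l(x)" in the docstring
    bad_kde = gaussian_kde(observed_x[order[n_good:]])     # "g(x)" in the docstring

    num_samples = 64
    candidates = good_kde.resample(num_samples, seed=1)[0]  # sample candidates from the good-points KDE
    best = candidates[np.argmax(good_kde(candidates) / bad_kde(candidates))]
    print(round(float(best), 3))                    # candidate maximizing l(x)/g(x)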
nni/algorithms/hpo/curvefitting_assessor/curvefitting_assessor.py  (view file @ a911b856)

...
@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator):
         }).validate(kwargs)

 class CurvefittingAssessor(Assessor):
-    """CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future.
+    """
+    CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future.
+
+    The intermediate result **must** be accuracy. Curve fitting does not support minimizing loss.
+
+    Curve fitting assessor is an LPA (learning, predicting, assessing) algorithm.
+    It stops a pending trial X at step S if the trial's forecast result at target step is convergence and lower than the
+    best performance in the history.
+
+    Paper: `Speeding up Automatic Hyperparameter Optimization of Deep Neural Networks by Extrapolation of Learning Curves
+    <https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf>`__
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.assessor.name = 'Curvefitting'
+        config.tuner.class_args = {
+            'epoch_num': 20,
+            'start_step': 6,
+            'threshold': 9,
+            'gap': 1,
+        }

     Parameters
     ----------
     epoch_num : int
-        The total number of epoch
+        The total number of epochs.
+        We need to know the number of epochs to determine which points we need to predict.
     start_step : int
-        only after receiving start_step number of reported intermediate results
+        A trial is determined to be stopped or not only after receiving start_step number of intermediate results.
     threshold : float
-        The threshold that we decide to early stop the worse performance curve.
+        The threshold that we use to decide to early stop the worst performance curve.
+        For example: if threshold = 0.95, and the best performance in the history is 0.9,
+        then we will stop the trial who's predicted value is lower than 0.95 * 0.9 = 0.855.
     gap : int
         The gap interval between assessor judgements.
+        For example: if gap = 2, start_step = 6,
+        then we will assess the result when we get 6, 8, 10, 12, ... intermediate results.
     """

     def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1):
...
@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor):
         logger.info('Successfully initials the curvefitting assessor')

     def trial_end(self, trial_job_id, success):
-        """update the best performance of completed trial job
-
-        Parameters
-        ----------
-        trial_job_id : int
-            trial job id
-        success : bool
-            True if succssfully finish the experiment, False otherwise
-        """
         if success:
             if self.set_best_performance:
                 self.completed_best_performance = max(self.completed_best_performance, self.trial_history[-1])
...
@@ -76,25 +101,6 @@ class CurvefittingAssessor(Assessor):
             logger.info('No need to update, trial job id: %s', trial_job_id)

     def assess_trial(self, trial_job_id, trial_history):
-        """assess whether a trial should be early stop by curve fitting algorithm
-
-        Parameters
-        ----------
-        trial_job_id : int
-            trial job id
-        trial_history : list
-            The history performance matrix of each trial
-
-        Returns
-        -------
-        bool
-            AssessResult.Good or AssessResult.Bad
-
-        Raises
-        ------
-        Exception
-            unrecognize exception in curvefitting_assessor
-        """
         scalar_trial_history = extract_scalar_history(trial_history)
         self.trial_history = scalar_trial_history
         if not self.set_best_performance:
...
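The threshold description above already contains the arithmetic; the restatement below (not the assessor's code) just makes the rule executable.

    # Stop a trial whose predicted final accuracy falls below threshold * best-so-far.
    def should_early_stop(predicted_final_accuracy, best_history_accuracy, threshold=0.95):
        return predicted_final_accuracy < threshold * best_history_accuracy

    print(should_early_stop(0.80, 0.90))   # True: 0.80 < 0.95 * 0.90 = 0.855
    print(should_early_stop(0.86, 0.90))   # False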
nni/algorithms/hpo/dngo_tuner.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 import logging
 import warnings
...
@@ -44,7 +47,20 @@ def _random_config(search_space, random_state):
 class DNGOTuner(Tuner):
+    """
+    Use neural networks as an alternative to GPs to model distributions over functions in bayesian optimization.
+
+    Parameters
+    ----------
+    optimize : maximize | minimize, default = maximize
+        If 'maximize', the tuner will target to maximize metrics. If 'minimize', the tuner will target to minimize metrics.
+    sample_size : int, default = 1000
+        Number of samples to select in each iteration. The best one will be picked from the samples as the next trial.
+    trials_per_update : int, default = 20
+        Number of trials to collect before updating the model.
+    num_epochs_per_training : int, default = 500
+        Number of epochs to train DNGO model.
+    """

     def __init__(self, optimize_mode='maximize', sample_size=1000, trials_per_update=20, num_epochs_per_training=500):
         self.searchspace_json = None
         self.random_state = None
...
nni/algorithms/hpo/evolution_tuner.py  (view file @ a911b856)

...
@@ -4,6 +4,7 @@
 """
 evolution_tuner.py
 """
+from __future__ import annotations

 import copy
 import random
...
@@ -22,28 +23,19 @@ logger = logging.getLogger(__name__)
 class Individual:
     """
-    Indicidual class to store the indv info.
+    Individual class to store the indv info.

-    Attributes
+    Parameters
     ----------
-    config : str
+    config : str, default = None
         Search space.
-    info : str
+    info : str, default = None
         The str to save information of individual.
-    result : float
+    result : float, None = None
         The final metric of a individual.
     """

     def __init__(self, config=None, info=None, result=None):
-        """
-        Parameters
-        ----------
-        config : str
-            A config to represent a group of parameters.
-        info : str
-        result : float
-        save_dir : str
-        """
         self.config = config
         self.result = result
         self.info = info
...
@@ -61,18 +53,36 @@ class EvolutionClassArgsValidator(ClassArgsValidator):
 class EvolutionTuner(Tuner):
     """
-    EvolutionTuner is tuner using navie evolution algorithm.
+    Naive Evolution comes from `Large-Scale Evolution of Image Classifiers <https://arxiv.org/pdf/1703.01041.pdf>`__
+    It randomly initializes a population based on the search space.
+    For each generation, it chooses better ones and does some mutation.
+    (e.g., changes a hyperparameter, adds/removes one layer, etc.) on them to get the next generation.
+    Naive Evolution requires many trials to works but it's very simple and it's easily expanded with new features.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'Evolution'
+        config.tuner.class_args = {
+            'optimize_mode': 'maximize',
+            'population_size': 100
+        }
+
+    Parameters
+    ----------
+    optimize_mode: str
+        Optimize mode, 'maximize' or 'minimize'.
+        If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
+    population_size: int
+        The initial size of the population (trial num) in the evolution tuner(default=32).
+        The larger population size, the better evolution performance.
+        It's suggested that ``population_size`` be much larger than ``concurrency`` so users can get the most out of the algorithm.
+        And at least ``concurrency``, or the tuner will fail on its first generation of parameters.
     """

-    def __init__(self, optimize_mode="maximize", population_size=32):
-        """
-        Parameters
-        ----------
-        optimize_mode : str, default 'maximize'
-        population_size : int
-            initial population size. The larger population size,
-            the better evolution performance.
-        """
+    def __init__(self, optimize_mode='maximize', population_size=32):
         self.optimize_mode = OptimizeMode(optimize_mode)
         self.population_size = population_size
...
@@ -89,11 +99,11 @@ class EvolutionTuner(Tuner):
     def update_search_space(self, search_space):
         """
         Update search space.

         Search_space contains the information that user pre-defined.

         Parameters
         ----------
         search_space : dict
         """
         self.searchspace_json = search_space
...
@@ -109,8 +119,10 @@ class EvolutionTuner(Tuner):
         """
         To deal with trial failure. If a trial fails,
         random generate the parameters and add into the population.

         Parameters
         ----------
         parameter_id : int
+            Unique identifier for hyper-parameters used by this trial.
         success : bool
...
@@ -136,12 +148,15 @@ class EvolutionTuner(Tuner):
     def generate_multiple_parameters(self, parameter_id_list, **kwargs):
         """
         Returns multiple sets of trial (hyper-)parameters, as iterable of serializable objects.

         Parameters
         ----------
         parameter_id_list : list of int
+            Unique identifiers for each set of requested hyper-parameters.
+        **kwargs
+            Not used

         Returns
         -------
         list
...
@@ -182,12 +197,13 @@ class EvolutionTuner(Tuner):
         Parameters
         ----------
         parameter_id : int

         Returns
         -------
         dict
-            A group of candaidte parameters that evolution tuner generated.
+            A group of candidate parameters that evolution tuner generated.
         """
         pos = -1
...
@@ -234,10 +250,12 @@ class EvolutionTuner(Tuner):
         Parameters
         ----------
         parameter_id : int

         Returns
         -------
         dict
             One newly generated configuration.
         """
...
@@ -258,6 +276,7 @@ class EvolutionTuner(Tuner):
         Parameters
         ----------
         parameter_id : int
         parameters : dict
         value : dict/float
...
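The docstring above summarizes a generation as "chooses better ones and does some mutation (e.g., changes a hyperparameter)". A toy mutation step (not the tuner's code) under an invented two-parameter search space:

    # Copy a parent configuration and re-sample one hyperparameter at random.
    import copy
    import random

    search_space = {
        'learning_rate': [0.1, 0.01, 0.001],
        'optimizer': ['SGD', 'Adam'],
    }

    def mutate(parent_config):
        child = copy.deepcopy(parent_config)
        key = random.choice(list(search_space))
        child[key] = random.choice(search_space[key])
        return child

    parent = {'learning_rate': 0.01, 'optimizer': 'SGD'}
    print(mutate(parent))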
nni/algorithms/hpo/gp_tuner/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .gp_tuner import GPTuner, GPClassArgsValidator
nni/algorithms/hpo/gp_tuner/gp_tuner.py  (view file @ a911b856)

...
@@ -41,29 +41,77 @@ class GPClassArgsValidator(ClassArgsValidator):
 class GPTuner(Tuner):
     """
-    GPTuner is a Bayesian Optimization method where Gaussian Process is used for modeling loss functions.
+    GPTuner is a Bayesian Optimization method where Gaussian Process
+    is used for modeling loss functions.
+
+    Bayesian optimization works by constructing a posterior distribution of functions
+    (a Gaussian Process) that best describes the function you want to optimize.
+    As the number of observations grows, the posterior distribution improves,
+    and the algorithm becomes more certain of which regions in parameter space
+    are worth exploring and which are not.
+
+    GPTuner is designed to minimize/maximize the number of steps required to find
+    a combination of parameters that are close to the optimal combination.
+    To do so, this method uses a proxy optimization problem (finding the maximum of
+    the acquisition function) that, albeit still a hard problem, is cheaper
+    (in the computational sense) to solve, and it's amenable to common tools.
+    Therefore, Bayesian Optimization is suggested for situations where sampling the function
+    to be optimized is very expensive.
+
+    Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
+    ``randint``, ``uniform``, ``quniform``, ``loguniform``, ``qloguniform``, and numerical ``choice``.
+
+    This optimization approach is described in Section 3 of the paper
+    `Algorithms for Hyper-Parameter Optimization <https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf>`__
+    ( :footcite:t:`bergstra2011algorithms` ).
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'GPTuner'
+        config.tuner.class_args = {
+            'optimize_mode': 'maximize',
+            'utility': 'ei',
+            'kappa': 5.0,
+            'xi': 0.0,
+            'nu': 2.5,
+            'alpha': 1e-6,
+            'cold_start_num': 10,
+            'selection_num_warm_up': 100000,
+            'selection_num_starting_points': 250
+        }

     Parameters
     ----------
     optimize_mode : str
-        optimize mode, 'maximize' or 'minimize', by default 'maximize'
+        Optimize mode, 'maximize' or 'minimize', by default 'maximize'
     utility : str
-        utility function (also called 'acquisition funcition') to use, which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
+        Utility function (also called 'acquisition funcition') to use,
+        which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
     kappa : float
-        value used by utility function 'ucb'. The bigger kappa is, the more the tuner will be exploratory. By default 5.
+        Value used by utility function 'ucb'. The bigger kappa is,
+        the more the tuner will be exploratory. By default 5.
     xi : float
-        used by utility function 'ei' and 'poi'. The bigger xi is, the more the tuner will be exploratory. By default 0.
+        Used by utility function 'ei' and 'poi'. The bigger xi is,
+        the more the tuner will be exploratory. By default 0.
     nu : float
-        used to specify Matern kernel. The smaller nu, the less smooth the approximated function is. By default 2.5.
+        Used to specify Matern kernel. The smaller nu,
+        the less smooth the approximated function is. By default 2.5.
     alpha : float
-        Used to specify Gaussian Process Regressor. Larger values correspond to increased noise level in the observations.
+        Used to specify Gaussian Process Regressor.
+        Larger values correspond to increased noise level in the observations.
         By default 1e-6.
     cold_start_num : int
-        Number of random exploration to perform before Gaussian Process. By default 10.
+        Number of random exploration to perform before Gaussian Process.
+        By default 10.
     selection_num_warm_up : int
-        Number of random points to evaluate for getting the point which maximizes the acquisition function. By default 100000
+        Number of random points to evaluate for getting the point which
+        maximizes the acquisition function. By default 100000
     selection_num_starting_points : int
-        Number of times to run L-BFGS-B from a random starting point after the warmup. By default 250.
+        Number of times to run L-BFGS-B from a random starting point after the warmup.
+        By default 250.
     """

     def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10,
...
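The kappa/nu/alpha descriptions above are easier to see against a concrete acquisition computation. The sketch below is not GPTuner's implementation; it fits a scikit-learn Gaussian Process with a Matern kernel and scores candidates with the 'ucb' utility, mean + kappa * std, on invented training points.

    # Larger kappa weights the posterior uncertainty more, i.e. more exploration.
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import Matern

    x_train = np.array([[0.1], [0.4], [0.9]])
    y_train = np.array([0.3, 0.8, 0.5])

    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), alpha=1e-6)
    gp.fit(x_train, y_train)

    candidates = np.linspace(0, 1, 101).reshape(-1, 1)
    mean, std = gp.predict(candidates, return_std=True)
    for kappa in (1.0, 5.0):
        idx = int(np.argmax(mean + kappa * std))
        print(kappa, float(candidates[idx, 0]))   # higher kappa tends toward less-explored points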
nni/algorithms/hpo/gridsearch_tuner.py  (view file @ a911b856)

...
@@ -2,14 +2,10 @@
 # Licensed under the MIT license.

 """
-Grid search tuner for hyper-parameter optimization.
-
-For categorical parameters this tuner fully explore all combinations.
-For numerical parameters it samples them at progressively decreased intervals.
-
-Use this tuner if you have abundant resource and want to find strictly optimal parameters.
-
-Grid search tuner has no argument.
+Grid search tuner.
 """

 __all__ = ['GridSearchTuner']
...
@@ -63,6 +59,35 @@ _logger = logging.getLogger('nni.tuner.gridsearch')
 ##

 class GridSearchTuner(Tuner):
+    """
+    Grid search tuner divides search space into evenly spaced grid, and performs brute-force traverse.
+
+    Recommended when the search space is small, or if you want to find strictly optimal hyperparameters.
+
+    **Implementation**
+
+    The original grid search approach performs an exhaustive search through a space consists of ``choice`` and ``randint``.
+
+    NNI's implementation extends grid search to support all search spaces types.
+
+    When the search space contains continuous parameters like ``normal`` and ``loguniform``,
+    grid search tuner works in following steps:
+
+    1. Divide the search space into a grid.
+    2. Perform an exhaustive searth through the grid.
+    3. Subdivide the grid into a finer-grained new grid.
+    4. Goto step 2, until experiment end.
+
+    As a deterministic algorithm, grid search has no argument.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'GridSearch'
+    """
+
     def __init__(self):
         self.space = None
...
@@ -175,13 +200,18 @@ class GridSearchTuner(Tuner):
                 mid = (l + r) / 2
                 diff_l = _less(l, mid, spec)
                 diff_r = _less(mid, r, spec)
-                if diff_l and diff_r:
-                    # we can skip these for non-q, but it will complicate the code
+                # if l != 0 and r != 1, then they are already in the grid, else they are not
+                # the special case is needed because for normal distribution 0 and 1 will generate infinity
+                if (diff_l or l == 0.0) and (diff_r or r == 1.0):
+                    # we can skip these for non-q, but it will complicate the code
                     new_vals.append(mid)
                     updated = True
                 if diff_l:
                     new_divs.append((l, mid))
+                    updated = (updated or l == 0.0)
                 if diff_r:
                     new_divs.append((mid, r))
+                    updated = (updated or r == 1.0)
             self.grid[i] += new_vals
             self.divisions[i] = new_divs
...
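Steps 1-4 of the new docstring describe grid subdivision for continuous parameters. Below is a toy sketch of that refinement over a parameter normalized to [0, 1]; it is not the tuner's code, which also handles q-values and distribution endpoints as the second hunk above shows.

    # Each pass inserts the midpoints of the current intervals, giving a finer grid.
    def refine(grid):
        new_grid = []
        for left, right in zip(grid, grid[1:]):
            new_grid += [left, (left + right) / 2]
        return new_grid + [grid[-1]]

    grid = [0.0, 1.0]
    for _ in range(3):
        grid = refine(grid)
    print(grid)   # [0.0, 0.125, 0.25, ..., 0.875, 1.0]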
nni/algorithms/hpo/hyperband_advisor.py  (view file @ a911b856)

...
@@ -105,7 +105,8 @@ def json2parameter(ss_spec, random_state):
 class Bracket():
-    """A bracket in Hyperband, all the information of a bracket is managed by an instance of this class
+    """
+    A bracket in Hyperband, all the information of a bracket is managed by an instance of this class

     Parameters
     ----------
...
@@ -267,24 +268,136 @@ class HyperbandClassArgsValidator(ClassArgsValidator):
 class Hyperband(MsgDispatcherBase):
     """
-    Hyperband inherit from MsgDispatcherBase rather than Tuner, because it integrates both tuner's functions and assessor's functions.
-    This is an implementation that could fully leverage available resources or follow the algorithm process,
-    i.e., high parallelism or serial.
-    A single execution of Hyperband takes a finite budget of (s_max + 1)B.
+    `Hyperband <https://arxiv.org/pdf/1603.06560.pdf>`__ is a multi-fidelity hyperparameter tuning algorithm
+    based on successive halving.
+    The basic idea of Hyperband is to create several buckets,
+    each having ``n`` randomly generated hyperparameter configurations,
+    each configuration using ``r`` resources (e.g., epoch number, batch number).
+    After the ``n`` configurations are finished, it chooses the top ``n/eta`` configurations
+    and runs them using increased ``r*eta`` resources.
+    At last, it chooses the best configuration it has found so far.
+    Please refer to the paper :footcite:t:`li2017hyperband` for detailed algorithm.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.advisor.name = 'Hyperband'
+        config.advisor.class_args = {
+            'optimize_mode': 'maximize',
+            'R': 60,
+            'eta': 3
+        }
+
+    Note that once you use Advisor, you are not allowed to add a Tuner and Assessor spec in the config file.
+
+    When Hyperband is used, the dict returned by :func:`nni.get_next_parameter` has one more key
+    called ``TRIAL_BUDGET`` besides the hyperparameters and their values.
+    **With this TRIAL_BUDGET, users can control in trial code how long a trial runs by following
+    the suggested trial budget from Hyperband.** ``TRIAL_BUDGET`` is a relative number,
+    users can interpret them as number of epochs, number of mini-batches, running time, etc.
+
+    Here is a concrete example of ``R=81`` and ``eta=3``:
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * -
+          - s=4
+          - s=3
+          - s=2
+          - s=1
+          - s=0
+        * - i
+          - n r
+          - n r
+          - n r
+          - n r
+          - n r
+        * - 0
+          - 81 1
+          - 27 3
+          - 9 9
+          - 6 27
+          - 5 81
+        * - 1
+          - 27 3
+          - 9 9
+          - 3 27
+          - 2 81
+          -
+        * - 2
+          - 9 9
+          - 3 27
+          - 1 81
+          -
+          -
+        * - 3
+          - 3 27
+          - 1 81
+          -
+          -
+          -
+        * - 4
+          - 1 81
+          -
+          -
+          -
+          -
+
+    ``s`` means bucket, ``n`` means the number of configurations that are generated,
+    the corresponding ``r`` means how many budgets these configurations run.
+    ``i`` means round, for example, bucket 4 has 5 rounds, bucket 3 has 4 rounds.
+
+    A complete example can be found :githublink:`examples/trials/mnist-advisor`.

     Parameters
     ----------
+    optimize_mode: str
+        Optimize mode, 'maximize' or 'minimize'.
     R: int
-        the maximum amount of resource that can be allocated to a single configuration
+        The maximum amount of budget that can be allocated to a single configuration.
+        Here, trial budget could mean the number of epochs, number of mini-batches, etc.,
+        depending on how users interpret it.
+        Each trial should use ``TRIAL_BUDGET`` to control how long it runs.
     eta: int
-        the variable that controls the proportion of configurations discarded in each round of SuccessiveHalving
-    optimize_mode: str
-        optimize mode, 'maximize' or 'minimize'
+        The variable that controls the proportion of configurations discarded in each round of SuccessiveHalving.
+        ``1/eta`` configurations will survive and rerun using more budgets in each round.
     exec_mode: str
-        execution mode, 'serial' or 'parallelism'
+        Execution mode, 'serial' or 'parallelism'.
+        If 'parallelism', the tuner will try to use available resources to start new bucket immediately.
+        If 'serial', the tuner will only start new bucket after the current bucket is done.
+
+    Notes
+    -----
+    First, Hyperband is an example of how to write an autoML algorithm based on MsgDispatcherBase,
+    rather than based on Tuner and Assessor. Hyperband is implemented in this way
+    because it integrates the functions of both Tuner and Assessor, thus, we call it Advisor.
+
+    Second, this implementation fully leverages Hyperband's internal parallelism.
+    Specifically, the next bucket is not started strictly after the current bucket.
+    Instead, it starts when there are available resources. If you want to use full parallelism mode,
+    set ``exec_mode`` to ``parallelism``.
+
+    Or if you want to set ``exec_mode`` with ``serial`` according to the original algorithm.
+    In this mode, the next bucket will start strictly after the current bucket.
+
+    ``parallelism`` mode may lead to multiple unfinished buckets,
+    in contrast, there is at most one unfinished bucket under ``serial`` mode.
+    The advantage of ``parallelism`` mode is to make full use of resources,
+    which may reduce the experiment duration multiple times.
     """

-    def __init__(self, R=60, eta=3, optimize_mode='maximize', exec_mode='parallelism'):
+    def __init__(self, optimize_mode='maximize', R=60, eta=3, exec_mode='parallelism'):
         """B = (s_max + 1)R"""
         super(Hyperband, self).__init__()
         self.R = R
...
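The R=81, eta=3 table above can be reproduced column by column with the successive-halving schedule it illustrates: within one bucket, keep the top 1/eta of the configurations each round and multiply their budget by eta. A small sketch follows; it is not the advisor's code, and the function name is invented.

    # (configs, budget per config) for each round i = 0..s of one bucket.
    def bucket_schedule(n, r, s, eta=3):
        return [(n // eta ** i, r * eta ** i) for i in range(s + 1)]

    # Reproduces the s=4 column of the table above:
    print(bucket_schedule(n=81, r=1, s=4))   # [(81, 1), (27, 3), (9, 9), (3, 27), (1, 81)]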
nni/algorithms/hpo/hyperopt_tuner.py  (view file @ a911b856)

...
@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator):
 class HyperoptTuner(Tuner):
     """
-    HyperoptTuner is a tuner which using hyperopt algorithm.
+    NNI wraps `hyperopt <https://github.com/hyperopt/hyperopt>`__ to provide anneal tuner.
+
+    This simple annealing algorithm begins by sampling from the prior
+    but tends over time to sample from points closer and closer to the best ones observed.
+    This algorithm is a simple variation of random search that leverages smoothness in the response surface.
+    The annealing rate is not adaptive.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'Anneal'
+        config.tuner.class_args = {
+            'optimize_mode': 'minimize'
+        }
+
+    Parameters
+    ----------
+    optimze_mode: 'minimize' or 'maximize'
+        Whether optimize to minimize or maximize trial result.
     """

     def __init__(self, algorithm_name, optimize_mode='minimize',
                  parallel_optimize=False, constant_liar_type='min'):
-        """
-        Parameters
-        ----------
-        algorithm_name : str
-            algorithm_name includes "tpe", "random_search" and anneal".
-        optimize_mode : str
-        parallel_optimize : bool
-            More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
-        constant_liar_type : str
-            constant_liar_type including "min", "max" and "mean"
-            More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
-        """
         self.algorithm_name = algorithm_name
         self.optimize_mode = OptimizeMode(optimize_mode)
         self.json = None
...
@@ -238,15 +246,6 @@ class HyperoptTuner(Tuner):
             raise RuntimeError('Not support tuner algorithm in hyperopt.')

     def update_search_space(self, search_space):
-        """
-        Update search space definition in tuner by search_space in parameters.
-
-        Will called when first setup experiemnt or update search space in WebUI.
-
-        Parameters
-        ----------
-        search_space : dict
-        """
         validate_search_space(search_space)
         self.json = search_space
...
@@ -266,22 +265,11 @@ class HyperoptTuner(Tuner):
         self.rval.catch_eval_exceptions = False

     def generate_parameters(self, parameter_id, **kwargs):
-        """
-        Returns a set of trial (hyper-)parameters, as a serializable object.
-
-        Parameters
-        ----------
-        parameter_id : int
-
-        Returns
-        -------
-        params : dict
-        """
-        total_params = self.get_suggestion(random_search=False)
+        total_params = self._get_suggestion(random_search=False)
         # avoid generating same parameter with concurrent trials because hyperopt doesn't support parallel mode
         if total_params in self.total_data.values():
             # but it can cause duplicate parameter rarely
-            total_params = self.get_suggestion(random_search=True)
+            total_params = self._get_suggestion(random_search=True)
         self.total_data[parameter_id] = total_params

         if self.parallel:
...
@@ -291,17 +279,6 @@ class HyperoptTuner(Tuner):
         return params

     def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
-        """
-        Record an observation of the objective function
-
-        Parameters
-        ----------
-        parameter_id : int
-        parameters : dict
-        value : dict/float
-            if value is dict, it should have "default" key.
-            value is final metrics of the trial.
-        """
         reward = extract_scalar_reward(value)
         # restore the paramsters contains '_index'
         if parameter_id not in self.total_data:
...
@@ -369,7 +346,7 @@ class HyperoptTuner(Tuner):
             idxs[key] = [new_id]
             vals[key] = [vals[key]]

-        self.miscs_update_idxs_vals(rval_miscs,
+        self._miscs_update_idxs_vals(rval_miscs,
                                     idxs,
                                     vals,
                                     idxs_map={new_id: new_id},
...
@@ -382,7 +359,7 @@ class HyperoptTuner(Tuner):
         trials.insert_trial_docs([trial])
         trials.refresh()

-    def miscs_update_idxs_vals(self,
+    def _miscs_update_idxs_vals(self,
                                miscs,
                                idxs,
                                vals,
...
@@ -416,7 +393,7 @@ class HyperoptTuner(Tuner):
             misc_by_id[tid]['idxs'][key] = [tid]
             misc_by_id[tid]['vals'][key] = [val]

-    def get_suggestion(self, random_search=False):
+    def _get_suggestion(self, random_search=False):
         """
         get suggestion from hyperopt
...
@@ -469,14 +446,6 @@ class HyperoptTuner(Tuner):
         return total_params

     def import_data(self, data):
-        """
-        Import additional data for tuning
-
-        Parameters
-        ----------
-        data:
-            a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
-        """
         _completed_num = 0
         for trial_info in data:
             logger.info("Importing data, current processing progress %s / %s", _completed_num, len(data))
...
nni/algorithms/hpo/medianstop_assessor.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+from __future__ import annotations
+
 import logging
 from schema import Schema, Optional

 from nni import ClassArgsValidator
 from nni.assessor import Assessor, AssessResult
+from nni.typehint import Literal
 from nni.utils import extract_scalar_history

 logger = logging.getLogger('medianstop_Assessor')
...
@@ -18,18 +21,35 @@ class MedianstopClassArgsValidator(ClassArgsValidator):
         }).validate(kwargs)

 class MedianstopAssessor(Assessor):
-    """MedianstopAssessor is The median stopping rule stops a pending trial X at step S
+    """
+    The median stopping rule stops a pending trial X at step S
     if the trial's best objective value by step S is strictly worse than the median value
     of the running averages of all completed trials' objectives reported up to step S

+    Paper: `Google Vizer: A Service for Black-Box Optimization
+    <https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf>`__
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.assessor.name = 'Medianstop'
+        config.tuner.class_args = {
+            'optimize_mode': 'maximize',
+            'start_step': 5
+        }
+
     Parameters
     ----------
-    optimize_mode : str
-        optimize mode, 'maximize' or 'minimize'
-    start_step : int
-        only after receiving start_step number of reported intermediate results
+    optimize_mode
+        Whether optimize to minimize or maximize trial result.
+    start_step
+        A trial is determined to be stopped or not
+        only after receiving start_step number of reported intermediate results.
     """

-    def __init__(self, optimize_mode='maximize', start_step=0):
+    def __init__(self, optimize_mode: Literal['minimize', 'maximize'] = 'maximize', start_step: int = 0):
         self._start_step = start_step
         self._running_history = dict()
         self._completed_avg_history = dict()
...
@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor):
             self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):])

     def trial_end(self, trial_job_id, success):
-        """trial_end
-
-        Parameters
-        ----------
-        trial_job_id : int
-            trial job id
-        success : bool
-            True if succssfully finish the experiment, False otherwise
-        """
         if trial_job_id in self._running_history:
             if success:
                 cnt = 0
...
@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor):
             logger.warning('trial_end: trial_job_id does not exist in running_history')

     def assess_trial(self, trial_job_id, trial_history):
-        """assess_trial
-
-        Parameters
-        ----------
-        trial_job_id : int
-            trial job id
-        trial_history : list
-            The history performance matrix of each trial
-
-        Returns
-        -------
-        bool
-            AssessResult.Good or AssessResult.Bad
-
-        Raises
-        ------
-        Exception
-            unrecognize exception in medianstop_assessor
-        """
         curr_step = len(trial_history)
         if curr_step < self._start_step:
             return AssessResult.Good
...
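The rule quoted in the new docstring can be restated directly. A self-contained sketch (not the assessor's code), assuming higher intermediate results are better:

    # Stop if the trial's best value so far is worse than the median of the
    # completed trials' running averages at the same step.
    from statistics import median

    def median_stop(trial_history, completed_histories):
        step = len(trial_history)
        best_so_far = max(trial_history)
        running_averages = [
            sum(h[:step]) / step for h in completed_histories if len(h) >= step
        ]
        return bool(running_averages) and best_so_far < median(running_averages)

    completed = [[0.5, 0.6, 0.7], [0.4, 0.5, 0.6], [0.6, 0.7, 0.8]]
    print(median_stop([0.2, 0.3], completed))   # True: 0.3 < 0.55 (median of step-2 averages)
    print(median_stop([0.6, 0.7], completed))   # False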
nni/algorithms/hpo/metis_tuner/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .metis_tuner import MetisTuner, MetisClassArgsValidator
nni/algorithms/hpo/metis_tuner/metis_tuner.py  (view file @ a911b856)

...
@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator):
 class MetisTuner(Tuner):
     """
-    Metis Tuner
-
-    More algorithm information you could reference here:
-    https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
-
-    Attributes
-    ----------
-    optimize_mode : str
-        optimize_mode is a string that including two mode "maximize" and "minimize"
-    no_resampling : bool
-        True or False.
-        Should Metis consider re-sampling as part of the search strategy?
-        If you are confident that the training dataset is noise-free,
-        then you do not need re-sampling.
-    no_candidates : bool
-        True or False.
-        Should Metis suggest parameters for the next benchmark?
-        If you do not plan to do more benchmarks,
-        Metis can skip this step.
-    selection_num_starting_points : int
-        How many times Metis should try to find the global optimal in the search space?
-        The higher the number, the longer it takes to output the solution.
-    cold_start_num : int
-        Metis need some trial result to get cold start.
-        when the number of trial result is less than
-        cold_start_num, Metis will randomly sample hyper-parameter for trial.
-    exploration_probability: float
-        The probability of Metis to select parameter from exploration instead of exploitation.
+    `Metis tuner <https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/>`__ offers
+    several benefits over other tuning algorithms.
+    While most tools only predict the optimal configuration, Metis gives you two outputs,
+    a prediction for the optimal configuration and a suggestion for the next trial.
+    No more guess work!
+
+    While most tools assume training datasets do not have noisy data,
+    Metis actually tells you if you need to resample a particular hyper-parameter.
+
+    While most tools have problems of being exploitation-heavy,
+    Metis' search strategy balances exploration, exploitation, and (optional) resampling.
+
+    Metis belongs to the class of sequential model-based optimization (SMBO) algorithms
+    and it is based on the Bayesian Optimization framework. To model the parameter-vs-performance space,
+    Metis uses both a Gaussian Process and GMM. Since each trial can impose a high time cost,
+    Metis heavily trades inference computations with naive trials.
+    At each iteration, Metis does two tasks (refer to :footcite:t:`li2018metis` for details):
+
+    1. It finds the global optimal point in the Gaussian Process space.
+       This point represents the optimal configuration.
+
+    2. It identifies the next hyper-parameter candidate.
+       This is achieved by inferring the potential information gain of
+       exploration, exploitation, and resampling.
+
+    Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
+    ``quniform``, ``uniform``, ``randint``, and numerical ``choice``.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'MetisTuner'
+        config.tuner.class_args = {
+            'optimize_mode': 'maximize'
+        }
+
+    Parameters
+    ----------
+    optimize_mode : str
+        optimize_mode is a string that including two mode "maximize" and "minimize"
+    no_resampling : bool
+        True or False.
+        Should Metis consider re-sampling as part of the search strategy?
+        If you are confident that the training dataset is noise-free,
+        then you do not need re-sampling.
+    no_candidates : bool
+        True or False.
+        Should Metis suggest parameters for the next benchmark?
+        If you do not plan to do more benchmarks,
+        Metis can skip this step.
+    selection_num_starting_points : int
+        How many times Metis should try to find the global optimal in the search space?
+        The higher the number, the longer it takes to output the solution.
+    cold_start_num : int
+        Metis need some trial result to get cold start.
+        when the number of trial result is less than
+        cold_start_num, Metis will randomly sample hyper-parameter for trial.
+    exploration_probability: float
+        The probability of Metis to select parameter from exploration instead of exploitation.
     """

     def __init__(
...
@@ -89,43 +124,6 @@ class MetisTuner(Tuner):
             selection_num_starting_points=600,
             cold_start_num=10,
             exploration_probability=0.9):
-        """
-        Parameters
-        ----------
-        optimize_mode : str
-            optimize_mode is a string that including two mode "maximize" and "minimize"
-        no_resampling : bool
-            True or False.
-            Should Metis consider re-sampling as part of the search strategy?
-            If you are confident that the training dataset is noise-free,
-            then you do not need re-sampling.
-        no_candidates : bool
-            True or False.
-            Should Metis suggest parameters for the next benchmark?
-            If you do not plan to do more benchmarks,
-            Metis can skip this step.
-        selection_num_starting_points : int
-            How many times Metis should try to find the global optimal in the search space?
-            The higher the number, the longer it takes to output the solution.
-        cold_start_num : int
-            Metis need some trial result to get cold start.
-            when the number of trial result is less than
-            cold_start_num, Metis will randomly sample hyper-parameter for trial.
-        exploration_probability : float
-            The probability of Metis to select parameter from exploration instead of exploitation.
-        x_bounds : list
-            The constration of parameters.
-        x_types : list
-            The type of parameters.
-        """
         self.samples_x = []
         self.samples_y = []
         self.samples_y_aggregation = []
...
@@ -141,7 +139,9 @@ class MetisTuner(Tuner):
         self.minimize_constraints_fun = None
         self.minimize_starting_points = None
         self.supplement_data_num = 0

+        # The constration of parameters
         self.x_bounds = []
+        # The type of parameters
         self.x_types = []
...
nni/algorithms/hpo/networkmorphism_tuner/__init__.py  (view file @ a911b856)

 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

 from .networkmorphism_tuner import NetworkMorphismTuner, NetworkMorphismClassArgsValidator