Unverified Commit f355b956 authored by QuanluZhang, committed by GitHub

[doc] refactor of tuner docs (#4640)

parent fbee0df1
**How to Use Shared Storage**
=============================
How to Use Shared Storage
=========================
If you want to use your own storage while using NNI, shared storage can meet this need.
Compared with the training service's native storage, shared storage brings you more convenience.
......
......@@ -8,6 +8,43 @@
year={2011}
}
@inproceedings{li2018metis,
title={Metis: Robustly tuning tail latencies of cloud systems},
author={Li, Zhao Lucis and Liang, Chieh-Jan Mike and He, Wenjia and Zhu, Lianjie and Dai, Wenjun and Jiang, Jin and Sun, Guangzhong},
booktitle={2018 USENIX Annual Technical Conference (USENIX ATC 18)},
pages={981--992},
year={2018}
}
@inproceedings{hutter2011sequential,
title={Sequential model-based optimization for general algorithm configuration},
author={Hutter, Frank and Hoos, Holger H and Leyton-Brown, Kevin},
booktitle={International conference on learning and intelligent optimization},
pages={507--523},
year={2011},
organization={Springer}
}
@article{li2017hyperband,
title={Hyperband: A novel bandit-based approach to hyperparameter optimization},
author={Li, Lisha and Jamieson, Kevin and DeSalvo, Giulia and Rostamizadeh, Afshin and Talwalkar, Ameet},
journal={The Journal of Machine Learning Research},
volume={18},
number={1},
pages={6765--6816},
year={2017},
publisher={JMLR. org}
}
@inproceedings{falkner2018bohb,
title={BOHB: Robust and efficient hyperparameter optimization at scale},
author={Falkner, Stefan and Klein, Aaron and Hutter, Frank},
booktitle={International Conference on Machine Learning},
pages={1437--1446},
year={2018},
organization={PMLR}
}
/* NAS */
@inproceedings{zoph2017neural,
......
......@@ -249,20 +249,52 @@ class BOHBClassArgsValidator(ClassArgsValidator):
class BOHB(MsgDispatcherBase):
"""
`BOHB <https://arxiv.org/abs/1807.01774>`__ is a robust and efficient hyperparameter tuning algorithm at scale.
BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".
BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget,
but it replaces the random selection of configurations at the beginning of each HB iteration
by a model-based search (Bayesian Optimization).
Once the desired number of configurations for the iteration is reached,
the standard successive halving procedure is carried out using these configurations.
BOHB keeps track of the performance of all function evaluations g(x, b) of configurations x
on all budgets b, and uses them as a basis for its models in later iterations.
Please refer to the paper :footcite:t:`falkner2018bohb` for the detailed algorithm.
Note that BOHB needs additional installation using the following command:
.. code-block:: bash
pip install nni[BOHB]
Examples
--------
.. code-block::
config.advisor.name = 'BOHB'
config.advisor.class_args = {
'optimize_mode': 'maximize',
'min_budget': 1,
'max_budget': 27,
'eta': 3,
'min_points_in_model': 7,
'top_n_percent': 15,
'num_samples': 64,
'random_fraction': 0.33,
'bandwidth_factor': 3.0,
'min_bandwidth': 0.001
}
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
min_budget: float
The smallest budget to assign to a trial job (budget can be the number of mini-batches or epochs).
Needs to be positive.
max_budget: float
The largest budget to assign to a trial job. Needs to be larger than min_budget.
The budgets will be geometrically distributed
:math:`\\sim \\eta^k` for :math:`k\\in [0, 1, ... , num\\_subsets - 1]`.
eta: int
......@@ -271,21 +303,102 @@ class BOHB(MsgDispatcherBase):
1/eta of them 'advances' to the next round.
Must be greater than or equal to 2.
min_points_in_model: int
Number of observations required to start building a KDE. Default 'None' means dim+1;
when the number of completed trials on a budget is equal to or larger than ``max{dim+1, min_points_in_model}``,
BOHB will start to build a KDE model for that budget and use it to guide configuration selection.
Needs to be positive. (dim means the number of hyperparameters in the search space.)
top_n_percent: int
Percentage (between 1 and 99, default 15) of the observations that are considered good.
Good points and bad points are used for building the KDE models.
For example, if you have 100 observed trials and top_n_percent is 15,
then the top 15% of points will be used for building the good-point model "l(x)",
and the remaining 85% of points will be used for building the bad-point model "g(x)".
num_samples: int
Number of samples used to optimize EI (default 64).
BOHB samples ``num_samples`` points and compares their l(x)/g(x) values.
It returns the one with the maximum l(x)/g(x) value as the next configuration
if optimize_mode is ``maximize``; otherwise, it returns the one with the minimum value.
random_fraction: float
Fraction of purely random configurations that are sampled from the prior without the model.
bandwidth_factor: float
To encourage diversity, the points proposed to optimize EI are sampled
from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3).
It is suggested to use the default value if you are not familiar with KDE.
min_bandwidth: float
To keep diversity, even when all (good) samples have the same value for one of the parameters,
a minimum bandwidth (default: 1e-3) is used instead of zero.
It is suggested to use the default value if you are not familiar with KDE.
config_space: str
Directly use a .pcs file serialized by `ConfigSpace <https://automl.github.io/ConfigSpace/>`__ in "pcs new" format.
In this case, the search space file (if provided in config) will be ignored.
Note that this path needs to be an absolute path. Relative paths are currently not supported.
Notes
-----
Below, the BOHB process is introduced in two parts:
**The first part HB (Hyperband).**
BOHB follows Hyperband's way of choosing the budgets and continues to use SuccessiveHalving.
For more details, you can refer to the :class:`nni.algorithms.hpo.hyperband_advisor.Hyperband`
and the `reference paper for Hyperband <https://arxiv.org/abs/1603.06560>`__.
This procedure is summarized by the pseudocode below.
.. image:: ../../img/bohb_1.png
:scale: 80 %
:align: center
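As an illustration only (not part of the original docstring), the geometric budget ladder implied by
``min_budget``, ``max_budget`` and ``eta`` can be enumerated with a small helper; the function name is hypothetical.

.. code-block:: python

    # Illustrative sketch: enumerate the geometric budget ladder used by the HB part,
    # assuming budgets grow by a factor of eta from min_budget up to max_budget.
    import math

    def budget_ladder(min_budget=1.0, max_budget=27.0, eta=3):
        # Number of geometric steps between min_budget and max_budget.
        s_max = int(math.floor(math.log(max_budget / min_budget, eta) + 1e-9))
        return [min_budget * eta ** k for k in range(s_max + 1)]

    print(budget_ladder())  # e.g. [1.0, 3.0, 9.0, 27.0] for the defaults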
**The second part BO (Bayesian Optimization)**
The BO part of BOHB closely resembles TPE with one major difference:
It opted for a single multidimensional KDE compared to the hierarchy of one-dimensional KDEs used in TPE
in order to better handle interaction effects in the input space.
The Tree-structured Parzen Estimator (TPE) uses a KDE (kernel density estimator) to model the densities.
.. image:: ../../img/bohb_2.png
:scale: 80 %
:align: center
To fit useful KDEs, we require a minimum number of data points Nmin;
this is set to d + 1 for our experiments, where d is the number of hyperparameters.
To build a model as early as possible, we do not wait until Nb = \|Db\|,
the number of observations for budget b, is large enough to satisfy q · Nb ≥ Nmin.
Instead, after initializing with Nmin + 2 random configurations, we choose the
best and worst configurations, respectively, to model the two densities.
Note that it also samples a constant fraction named **random fraction** of the configurations uniformly at random.
.. image:: ../../img/bohb_3.png
:scale: 80 %
:align: center
.. image:: ../../img/bohb_6.jpg
:scale: 65 %
:align: center
**The above image shows the workflow of BOHB.**
Here we set max_budget = 9, min_budget = 1, eta = 3, and leave the others as default.
In this case, s_max = 2, so we will continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle.
In each stage of SuccessiveHalving (the orange box), it picks the top 1/eta configurations and runs them again with more budget,
repeating the SuccessiveHalving stage until the end of this iteration.
At the same time, it collects the configurations, budgets and final metrics of each trial
and uses these to build a multidimensional KDE model with the key "budget".
The multidimensional KDE is used to guide the selection of configurations for the next iteration.
The sampling procedure (using Multidimensional KDE to guide selection) is summarized by the pseudocode below.
.. image:: ../../img/bohb_4.png
:scale: 80 %
:align: center
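The ratio-based selection can be sketched as follows; this is an illustrative approximation using Gaussian KDEs
from ``scipy`` with mock data, not NNI's internal BOHB implementation.

.. code-block:: python

    # Illustrative sketch of TPE/BOHB-style selection: fit one KDE on "good"
    # observations and one on "bad" ones, then pick the sampled candidate that
    # maximizes l(x) / g(x).
    import numpy as np
    from scipy.stats import gaussian_kde

    rng = np.random.default_rng(0)
    observed_x = rng.uniform(0, 1, size=(100, 2))        # 100 configs, 2 hyperparameters
    observed_y = ((observed_x - 0.6) ** 2).sum(axis=1)   # mock losses (lower is better)

    top_n_percent = 15
    cut = np.percentile(observed_y, top_n_percent)
    good = observed_x[observed_y <= cut]                 # "good" observations -> l(x)
    bad = observed_x[observed_y > cut]                   # "bad" observations  -> g(x)

    l_kde = gaussian_kde(good.T)
    g_kde = gaussian_kde(bad.T)

    candidates = rng.uniform(0, 1, size=(64, 2))         # num_samples = 64
    ratio = l_kde(candidates.T) / g_kde(candidates.T)
    print("next configuration:", candidates[ratio.argmax()])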
**Here is a simple experiment which tunes MNIST with BOHB.**
Code implementation: :githublink:`examples/trials/mnist-advisor <examples/trials/mnist-advisor>`
The following shows the final experimental results:
.. image:: ../../img/bohb_5.png
:scale: 80 %
:align: center
More experimental results can be found in the `reference paper <https://arxiv.org/abs/1807.01774>`__.
It shows that BOHB makes good use of previous results and achieves a balanced trade-off between exploration and exploitation.
"""
def __init__(self,
......
......@@ -41,29 +41,77 @@ class GPClassArgsValidator(ClassArgsValidator):
class GPTuner(Tuner):
"""
GPTuner is a Bayesian Optimization method where Gaussian Process
is used for modeling loss functions.
Bayesian optimization works by constructing a posterior distribution of functions
(a Gaussian Process) that best describes the function you want to optimize.
As the number of observations grows, the posterior distribution improves,
and the algorithm becomes more certain of which regions in parameter space
are worth exploring and which are not.
GPTuner is designed to minimize/maximize the number of steps required to find
a combination of parameters that are close to the optimal combination.
To do so, this method uses a proxy optimization problem (finding the maximum of
the acquisition function) that, albeit still a hard problem, is cheaper
(in the computational sense) to solve, and it's amenable to common tools.
Therefore, Bayesian Optimization is suggested for situations where sampling the function
to be optimized is very expensive.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``randint``, ``uniform``, ``quniform``, ``loguniform``, ``qloguniform``, and numerical ``choice``.
This optimization approach is described in Section 3 of the paper
`Algorithms for Hyper-Parameter Optimization <https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf>`__
(:footcite:t:`bergstra2011algorithms`).
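As a hedged sketch of this idea (using ``scikit-learn`` rather than NNI's internal implementation, with mock data
and made-up variable names), a GP surrogate with a UCB acquisition might look like the following; the kernel's
``nu``, the noise ``alpha`` and the UCB ``kappa`` correspond to the parameters documented below.

.. code-block:: python

    # Illustrative sketch only: fit a GP surrogate on observed (config, metric) pairs
    # and score candidates with a UCB acquisition. Not NNI's GPTuner implementation.
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import Matern

    rng = np.random.default_rng(0)
    X = rng.uniform(0, 1, size=(10, 2))                  # 10 observed configurations
    y = np.sin(3 * X[:, 0]) + X[:, 1] ** 2               # mock metric to maximize

    # nu and alpha mirror the tuner parameters documented below.
    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), alpha=1e-6, normalize_y=True)
    gp.fit(X, y)

    candidates = rng.uniform(0, 1, size=(1000, 2))
    mean, std = gp.predict(candidates, return_std=True)
    kappa = 5.0                                          # larger kappa -> more exploration
    ucb = mean + kappa * std
    print("next configuration to try:", candidates[ucb.argmax()])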
Examples
--------
.. code-block::
config.tuner.name = 'GPTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize',
'utility': 'ei',
'kappa': 5.0,
'xi': 0.0,
'nu': 2.5,
'alpha': 1e-6,
'cold_start_num': 10,
'selection_num_warm_up': 100000,
'selection_num_starting_points': 250
}
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize'. By default 'maximize'.
utility : str
Utility function (also called 'acquisition function') to use,
which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
kappa : float
Value used by utility function 'ucb'. The bigger kappa is,
the more exploratory the tuner will be. By default 5.
xi : float
Used by utility functions 'ei' and 'poi'. The bigger xi is,
the more exploratory the tuner will be. By default 0.
nu : float
Used to specify the Matern kernel. The smaller nu is,
the less smooth the approximated function is. By default 2.5.
alpha : float
Used to specify Gaussian Process Regressor.
Larger values correspond to increased noise level in the observations.
By default 1e-6.
cold_start_num : int
Number of random explorations to perform before Gaussian Process.
By default 10.
selection_num_warm_up : int
Number of random points to evaluate when searching for the point that
maximizes the acquisition function. By default 100000.
selection_num_starting_points : int
Number of times to run L-BFGS-B from a random starting point after the warmup.
By default 250.
"""
def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10,
......
......@@ -105,7 +105,8 @@ def json2parameter(ss_spec, random_state):
class Bracket():
"""A bracket in Hyperband, all the information of a bracket is managed by an instance of this class
"""
A bracket in Hyperband, all the information of a bracket is managed by an instance of this class
Parameters
----------
......@@ -267,24 +268,136 @@ class HyperbandClassArgsValidator(ClassArgsValidator):
class Hyperband(MsgDispatcherBase):
"""
`Hyperband <https://arxiv.org/pdf/1603.06560.pdf>`__ is a multi-fidelity hyperparameter tuning algorithm
based on successive halving.
The basic idea of Hyperband is to create several buckets,
each having ``n`` randomly generated hyperparameter configurations,
each configuration using ``r`` resources (e.g., epoch number, batch number).
After the ``n`` configurations are finished, it chooses the top ``n/eta`` configurations
and runs them using the increased ``r*eta`` resources.
Finally, it chooses the best configuration it has found so far.
Please refer to the paper :footcite:t:`li2017hyperband` for the detailed algorithm.
Examples
--------
.. code-block::
config.advisor.name = 'Hyperband'
config.advisor.class_args = {
'optimize_mode': 'maximize',
'R': 60,
'eta': 3
}
Note that once you use an advisor, you are not allowed to add a tuner or assessor spec in the config file.
When Hyperband is used, the dict returned by :func:`nni.get_next_parameter` contains one more key,
``TRIAL_BUDGET``, besides the hyperparameters and their values.
**With this TRIAL_BUDGET, users can control in trial code how long a trial runs by following
the suggested trial budget from Hyperband.** ``TRIAL_BUDGET`` is a relative number;
users can interpret it as the number of epochs, the number of mini-batches, running time, etc.
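A minimal trial-side sketch of consuming ``TRIAL_BUDGET`` (illustrative only; the hyperparameter name
and the training helper are hypothetical):

.. code-block:: python

    # Illustrative trial code: read TRIAL_BUDGET from the generated parameters
    # and interpret it as the number of training epochs.
    import nni

    def train_one_epoch(lr):
        # Placeholder for the user's real training step; returns a mock accuracy.
        return min(0.99, 0.5 + lr)

    params = nni.get_next_parameter()
    budget = int(params.get('TRIAL_BUDGET', 1))          # interpret the budget as epochs here
    lr = params.get('learning_rate', 0.01)               # hypothetical hyperparameter name

    best = 0.0
    for _ in range(budget):
        acc = train_one_epoch(lr)
        best = max(best, acc)
        nni.report_intermediate_result(acc)
    nni.report_final_result(best)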
Here is a concrete example of ``R=81`` and ``eta=3``:
.. list-table::
:header-rows: 1
:widths: auto
* -
- s=4
- s=3
- s=2
- s=1
- s=0
* - i
- n r
- n r
- n r
- n r
- n r
* - 0
- 81 1
- 27 3
- 9 9
- 6 27
- 5 81
* - 1
- 27 3
- 9 9
- 3 27
- 2 81
-
* - 2
- 9 9
- 3 27
- 1 81
-
-
* - 3
- 3 27
- 1 81
-
-
-
* - 4
- 1 81
-
-
-
-
``s`` denotes a bucket, ``n`` is the number of configurations generated in a round,
and the corresponding ``r`` is the budget each of these configurations runs with.
``i`` denotes the round; for example, bucket 4 has 5 rounds and bucket 3 has 4 rounds.
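For illustration only (not from the original docstring), the rounds inside a single bucket can be reproduced
with a few lines; with ``n=81, r=1, eta=3`` this yields the ``s=4`` column of the table above.

.. code-block:: python

    # Illustrative sketch: successive halving rounds inside one bucket.
    def bucket_rounds(n, r, eta=3):
        # Keep the top 1/eta configurations each round and multiply their budget by eta.
        rounds = []
        while n >= 1:
            rounds.append((n, r))
            n, r = n // eta, r * eta
        return rounds

    print(bucket_rounds(81, 1))  # [(81, 1), (27, 3), (9, 9), (3, 27), (1, 81)]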
A complete example can be found :githublink:`examples/trials/mnist-advisor`.
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
R: int
The maximum amount of budget that can be allocated to a single configuration.
Here, the trial budget could mean the number of epochs, the number of mini-batches, etc.,
depending on how users interpret it.
Each trial should use ``TRIAL_BUDGET`` to control how long it runs.
eta: int
The variable that controls the proportion of configurations discarded in each round of SuccessiveHalving.
``1/eta`` of the configurations will survive and rerun using more budget in each round.
exec_mode: str
Execution mode, 'serial' or 'parallelism'.
If 'parallelism', the tuner will try to use available resources to start new buckets immediately.
If 'serial', the tuner will only start a new bucket after the current bucket is done.
Notes
-----
First, Hyperband is an example of how to write an AutoML algorithm based on MsgDispatcherBase
rather than on Tuner and Assessor. Hyperband is implemented in this way
because it integrates the functions of both Tuner and Assessor; thus, we call it an Advisor.
Second, this implementation fully leverages Hyperband's internal parallelism.
Specifically, the next bucket is not started strictly after the current bucket.
Instead, it starts when there are available resources. If you want to use full parallelism mode,
set ``exec_mode`` to ``parallelism``.
Alternatively, set ``exec_mode`` to ``serial`` to follow the original algorithm;
in this mode, the next bucket starts strictly after the current bucket is done.
``parallelism`` mode may lead to multiple unfinished buckets;
in contrast, there is at most one unfinished bucket under ``serial`` mode.
The advantage of ``parallelism`` mode is that it makes full use of resources,
which may reduce the experiment duration considerably.
"""
def __init__(self, optimize_mode='maximize', R=60, eta=3, exec_mode='parallelism'):
"""B = (s_max + 1)R"""
super(Hyperband, self).__init__()
self.R = R
......
......@@ -46,12 +46,47 @@ class MetisClassArgsValidator(ClassArgsValidator):
class MetisTuner(Tuner):
"""
`Metis tuner <https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/>`__ offers
several benefits over other tuning algorithms.
While most tools only predict the optimal configuration, Metis gives you two outputs:
a prediction for the optimal configuration and a suggestion for the next trial.
No more guesswork!
While most tools assume training datasets do not have noisy data,
Metis actually tells you if you need to resample a particular hyper-parameter.
While most tools have the problem of being exploitation-heavy,
Metis' search strategy balances exploration, exploitation, and (optional) resampling.
Metis belongs to the class of sequential model-based optimization (SMBO) algorithms
and it is based on the Bayesian Optimization framework. To model the parameter-vs-performance space,
Metis uses both a Gaussian Process and a Gaussian Mixture Model (GMM). Since each trial can impose a high time cost,
Metis heavily trades (cheaper) inference computation for (more expensive) naive trials.
At each iteration, Metis does two tasks (refer to :footcite:t:`li2018metis` for details):
1. It finds the global optimal point in the Gaussian Process space.
This point represents the optimal configuration.
2. It identifies the next hyper-parameter candidate.
This is achieved by inferring the potential information gain of
exploration, exploitation, and resampling.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``quniform``, ``uniform``, ``randint``, and numerical ``choice``.
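As a hedged illustration (the hyperparameter names are made up), a search space restricted to the types Metis
accepts could be written as a Python dict when using the experiment API:

.. code-block:: python

    # Only types accepted by Metis: uniform, quniform, randint, and numerical choice.
    # Hyperparameter names are made up; assign the dict to ``config.search_space``
    # (the same ``config`` object used in the example below).
    search_space = {
        'learning_rate': {'_type': 'uniform', '_value': [0.0001, 0.1]},
        'hidden_size': {'_type': 'quniform', '_value': [64, 512, 64]},
        'num_layers': {'_type': 'randint', '_value': [1, 5]},
        'batch_size': {'_type': 'choice', '_value': [16, 32, 64, 128]},
    }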
Examples
--------
.. code-block::
config.tuner.name = 'MetisTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize'.
......@@ -89,43 +124,6 @@ class MetisTuner(Tuner):
selection_num_starting_points=600,
cold_start_num=10,
exploration_probability=0.9):
"""
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize'.
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
The number of times Metis tries to find the global optimum in the search space.
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis needs some trial results to do a cold start.
When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for the trials.
exploration_probability : float
The probability that Metis selects the parameters by exploration instead of exploitation.
x_bounds : list
The constraints of the parameters.
x_types : list
The types of the parameters.
"""
self.samples_x = []
self.samples_y = []
self.samples_y_aggregation = []
......@@ -141,7 +139,9 @@ class MetisTuner(Tuner):
self.minimize_constraints_fun = None
self.minimize_starting_points = None
self.supplement_data_num = 0
# The constraints of the parameters
self.x_bounds = []
# The types of the parameters
self.x_types = []
......
......@@ -38,12 +38,36 @@ class SMACClassArgsValidator(ClassArgsValidator):
class SMACTuner(Tuner):
"""
`SMAC <https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf>`__ is based on Sequential Model-Based Optimization (SMBO).
It adapts the most prominent previously used model class (Gaussian stochastic process models)
and introduces the model class of random forests to SMBO in order to handle categorical parameters.
The SMAC supported by NNI is a wrapper of `the SMAC3 GitHub repo <https://github.com/automl/SMAC3>`__,
following the NNI tuner interface :class:`nni.tuner.Tuner`. It only supports ``SMAC`` mode and does not support
multiple instances of SMAC3 (i.e., running the same configuration multiple times).
For algorithm details of SMAC, please refer to the paper :footcite:t:`hutter2011sequential`.
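To illustrate the idea only (this is not the SMAC3 code that NNI wraps, and the scoring rule below is a
simplification rather than SMAC's actual acquisition), a random-forest surrogate over mock data with an
exploration-aware score might look like this:

.. code-block:: python

    # Illustrative sketch: a random-forest surrogate as used in SMBO,
    # with per-tree disagreement as an uncertainty proxy.
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(0, 1, size=(20, 2))                  # 20 observed configurations
    y = (X[:, 0] - 0.3) ** 2 + X[:, 1]                   # mock losses (lower is better)

    forest = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)

    candidates = rng.uniform(0, 1, size=(1000, 2))
    per_tree = np.stack([tree.predict(candidates) for tree in forest.estimators_])
    mean, std = per_tree.mean(axis=0), per_tree.std(axis=0)

    # Simple exploration-aware score: prefer low predicted loss and high disagreement.
    score = (y.min() - mean) + std
    print("next candidate:", candidates[score.argmax()])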
Note that SMAC on NNI only supports a subset of the types in
:doc:`search space </hpo/search_space>`:
``choice``, ``randint``, ``uniform``, ``loguniform``, and ``quniform``.
Note that SMAC needs additional installation using the following command:
.. code-block:: bash
pip install nni[SMAC]
``swig`` is required for SMAC. On Ubuntu, ``swig`` can be installed with ``apt``.
Examples
--------
.. code-block::
config.tuner.name = 'SMAC'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
......@@ -52,6 +76,8 @@ class SMACTuner(Tuner):
If True, the tuner will not generate a configuration that has been already generated.
If False, a configuration may be generated twice, but it is rare for a relatively large search space.
"""
def __init__(self, optimize_mode="maximize", config_dedup=False):
self.logger = logger
self.optimize_mode = OptimizeMode(optimize_mode)
self.total_data = {}
......