OpenDAS / nni / Commits / b52f7756
"...testcase/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "0cea39c59857ed8e34c29374c069b595edd4dc60"
Unverified commit b52f7756, authored Mar 10, 2022 by liuzhe-lz; committed by GitHub, Mar 10, 2022.
HPO doc (#4579)
parent 88ffe908
Showing 10 changed files with 363 additions and 36 deletions (+363, −36)
docs/source/tutorials/index.rst                            +52   −0
examples/tutorials/hpo_quickstart_tensorflow/README.rst     +0   −0
examples/tutorials/hpo_quickstart_tensorflow/main.py      +116   −0
examples/tutorials/hpo_quickstart_tensorflow/model.py      +88   −0
nni/algorithms/hpo/random_tuner.py                         +22   −1
nni/algorithms/hpo/tpe_tuner.py                            +66  −19
nni/retiarii/nn/pytorch/hypermodule.py                      +4   −4
nni/retiarii/nn/pytorch/nn.py                               +5   −5
nni/retiarii/utils.py                                       +0   −7
nni/typehint.py                                            +10   −0
docs/source/tutorials/index.rst (view file @ b52f7756)

@@ -161,6 +161,58 @@ Tutorials
.. _sphx_glr_tutorials_hpo_quickstart_tensorflow:
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="The tutorial consists of 4 steps: ">
.. only:: html
.. figure:: /tutorials/hpo_quickstart_tensorflow/images/thumb/sphx_glr_main_thumb.png
:alt: NNI HPO Quickstart with TensorFlow
:ref:`sphx_glr_tutorials_hpo_quickstart_tensorflow_main.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/tutorials/hpo_quickstart_tensorflow/main
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="It can be run directly and will have the exact same result as original version.">
.. only:: html
.. figure:: /tutorials/hpo_quickstart_tensorflow/images/thumb/sphx_glr_model_thumb.png
:alt: Port TensorFlow Quickstart to NNI
:ref:`sphx_glr_tutorials_hpo_quickstart_tensorflow_model.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/tutorials/hpo_quickstart_tensorflow/model
.. raw:: html
<div class="sphx-glr-clear"></div>
.. only:: html

    .. rst-class:: sphx-glr-signature

...
examples/tutorials/hpo_quickstart_tensorflow/README.rst (new file, 0 → 100644, view file @ b52f7756)

examples/tutorials/hpo_quickstart_tensorflow/main.py (new file, 0 → 100644, view file @ b52f7756)
"""
NNI HPO Quickstart with TensorFlow
==================================
This tutorial optimizes the model in `official TensorFlow quickstart`_ with auto-tuning.
The tutorial consists of 4 steps:
1. Modify the model for auto-tuning.
2. Define hyperparameters' search space.
3. Configure the experiment.
4. Run the experiment.
.. _official TensorFlow quickstart: https://www.tensorflow.org/tutorials/quickstart/beginner
"""
# %%
# Step 1: Prepare the model
# -------------------------
# In the first step, you need to prepare the model to be tuned.
#
# The model should be put in a separate script.
# It will be evaluated many times concurrently,
# and possibly will be trained on distributed platforms.
#
# In this tutorial, the model is defined in :doc:`model.py <model>`.
#
# Please understand the model code before continuing to the next step.
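# %%
# A minimal sketch of a trial script's overall shape (the full version is in
# :doc:`model.py <model>`; the names below are illustrative only)::
#
#     params = {...}                            # default hyperparameters
#     params.update(nni.get_next_parameter())   # receive tuned values from NNI
#     ...                                       # build, train, and evaluate the model
#     nni.report_final_result(accuracy)         # report the metric to the tuner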
# %%
# Step 2: Define search space
# ---------------------------
# In the model code, we have prepared 4 hyperparameters to be tuned:
# *dense_units*, *activation_type*, *dropout_rate*, and *learning_rate*.
#
# Here we need to define their *search space* so the tuning algorithm can sample them in the desired range.
#
# Assume we have the following prior knowledge about these hyperparameters:
#
#  1. *dense_units* should be one of 64, 128, 256.
#  2. *activation_type* should be one of 'relu', 'tanh', 'swish', or None.
#  3. *dropout_rate* should be a float between 0.5 and 0.9.
#  4. *learning_rate* should be a float between 0.0001 and 0.1, following an exponential distribution.
#
# In NNI, the space of *dense_units* and *activation_type* is called ``choice``;
# the space of *dropout_rate* is called ``uniform``;
# and the space of *learning_rate* is called ``loguniform``.
# As you may have noticed, these names are derived from ``numpy.random``.
#
# For the full specification of the search space, check :doc:`the reference </hpo/search_space>`.
#
# Now we can define the search space as follows:
search_space = {
    'dense_units': {'_type': 'choice', '_value': [64, 128, 256]},
    'activation_type': {'_type': 'choice', '_value': ['relu', 'tanh', 'swish', None]},
    'dropout_rate': {'_type': 'uniform', '_value': [0.5, 0.9]},
    'learning_rate': {'_type': 'loguniform', '_value': [0.0001, 0.1]},
}
# %%
# Step 3: Configure the experiment
# --------------------------------
# NNI uses an *experiment* to manage the HPO process.
# The *experiment config* defines how to train the models and how to explore the search space.
#
# In this tutorial we use a *local* mode experiment,
# which means models will be trained on the local machine, without using any special training platform.
from nni.experiment import Experiment
experiment = Experiment('local')
# %%
# Now we start to configure the experiment.
#
# First, specify the model code.
# In NNI, the evaluation of each set of hyperparameters is called a *trial*,
# so the model script is called the *trial code*.
#
# If you are using a Linux system without Conda, you may need to change ``python`` to ``python3``.
#
# When ``trial_code_directory`` is a relative path, it is relative to the current working directory.
# To run ``main.py`` from a different path, you can set the trial code directory to ``Path(__file__).parent``
# (see the sketch after the next code cell).
experiment.config.trial_command = 'python model.py'
experiment.config.trial_code_directory = '.'
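# %%
# For example, a minimal sketch of the ``Path`` variant mentioned above (assuming you
# want ``main.py`` to be runnable from any working directory)::
#
#     from pathlib import Path
#     experiment.config.trial_code_directory = Path(__file__).parent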
# %%
# Then specify the search space we defined above:
experiment.config.search_space = search_space
# %%
# Choose a tuning algorithm.
# Here we use :doc:`TPE tuner </hpo/tuners>`.
experiment.config.tuner.name = 'TPE'
experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
# %%
# Specify how many trials to run.
# Here we evaluate 10 sets of hyperparameters in total, and concurrently evaluate 4 sets at a time.
#
# Please note that ``max_trial_number`` here is merely for a quick example.
# With the default config, the TPE tuner requires 20 trials to warm up.
# In the real world, the max trial number is commonly set to 100+.
#
# You can also set ``max_experiment_duration = '1h'`` to limit the running time
# (a sketch follows the next code cell).
#
# Alternatively, you can skip this part and set no limit at all.
# The experiment will run forever until you press Ctrl-C.
experiment.config.max_trial_number = 10
experiment.config.trial_concurrency = 4
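# %%
# A minimal sketch of the duration limit mentioned above (an optional alternative,
# not part of this tutorial's default config)::
#
#     experiment.config.max_experiment_duration = '1h'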
# %%
# Step 4: Run the experiment
# --------------------------
# Now the experiment is ready. Choose a port and launch it.
#
# You can use the web portal to view experiment status: http://localhost:8080.
experiment.run(8080)
examples/tutorials/hpo_quickstart_tensorflow/model.py (new file, 0 → 100644, view file @ b52f7756)
"""
Port TensorFlow Quickstart to NNI
=================================
This is a modified version of `TensorFlow quickstart`_.
It can be run directly and will produce exactly the same result as the original version.
Furthermore, it enables auto-tuning with an NNI *experiment*, which will be discussed later.
For now, we recommend running this script directly to verify the environment.

There are only 3 key differences from the original version:
 1. In the `Get optimized hyperparameters`_ part, it receives auto-generated hyperparameters.
 2. In the `(Optional) Report intermediate results`_ part, it reports per-epoch accuracy for visualization.
 3. In the `Report final result`_ part, it reports the final accuracy so the tuner can generate the next hyperparameter set.
.. _TensorFlow quickstart: https://www.tensorflow.org/tutorials/quickstart/beginner
"""
# %%
import nni
import tensorflow as tf
# %%
# Hyperparameters to be tuned
# ---------------------------
params = {
    'dense_units': 128,
    'activation_type': 'relu',
    'dropout_rate': 0.2,
    'learning_rate': 0.001,
}
# %%
# Get optimized hyperparameters
# -----------------------------
# If run directly, ``nni.get_next_parameter()`` is a no-op and returns an empty dict.
# But with an NNI *experiment*, it will receive optimized hyperparameters from the tuning algorithm.
optimized_params = nni.get_next_parameter()
params.update(optimized_params)
# %%
# Load dataset
# ------------
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
# %%
# Build model with hyperparameters
# --------------------------------
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(params['dense_units'], activation=params['activation_type']),
    tf.keras.layers.Dropout(params['dropout_rate']),
    tf.keras.layers.Dense(10)
])

adam = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=loss_fn, metrics=['accuracy'])
# %%
# (Optional) Report intermediate results
# --------------------------------------
# The callback reports per-epoch accuracy to show the learning curve in the NNI web portal.
# In :doc:`/hpo/assessors`, you will see how to leverage these metrics for early stopping.
#
# You can safely skip this and the experiment will work fine.
callback = tf.keras.callbacks.LambdaCallback(
    on_epoch_end=lambda epoch, logs: nni.report_intermediate_result(logs['accuracy'])
)
# %%
# Train and evaluate the model
# ---------------------------
model.fit(x_train, y_train, epochs=5, verbose=2, callbacks=[callback])
loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
# %%
# Report final result
# -------------------
# Report the final accuracy to NNI so the tuning algorithm can predict the best hyperparameters.
nni.report_final_result(accuracy)
nni/algorithms/hpo/random_tuner.py (view file @ b52f7756)
@@ -7,6 +7,8 @@ Naive random tuner for hyper-parameter optimization.
 You can specify an integer seed to determine random result.
 """

+from __future__ import annotations
+
 __all__ = ['RandomTuner', 'suggest', 'suggest_parameter']

 import logging
...
@@ -21,7 +23,26 @@ from nni.tuner import Tuner
 _logger = logging.getLogger('nni.tuner.random')

 class RandomTuner(Tuner):
-    def __init__(self, seed=None):
+    """
+    A naive tuner that generates fully random hyperparameters.
+
+    Examples
+    --------
+
+    .. code-block::
+
+        config.tuner.name = 'Random'
+        config.tuner.class_args = {
+            'seed': 100
+        }
+
+    Parameters
+    ----------
+    seed
+        The random seed.
+    """
+
+    def __init__(self, seed: int | None = None):
         self.space = None
         if seed is None:  # explicitly generate a seed to make the experiment reproducible
             seed = np.random.default_rng().integers(2 ** 31)
...
nni/algorithms/hpo/tpe_tuner.py (view file @ b52f7756)
@@ -10,18 +10,21 @@ Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py
 This is a slightly modified re-implementation of the algorithm.
 """

+from __future__ import annotations
+
 __all__ = ['TpeTuner', 'TpeArguments', 'suggest', 'suggest_parameter']

 from collections import defaultdict
 import logging
 import math
-from typing import NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np
 from scipy.special import erf  # pylint: disable=no-name-in-module

-from nni.tuner import Tuner
 from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters
+from nni.tuner import Tuner
+from nni.typehint import Literal
 from nni.utils import extract_scalar_reward

 from . import random_tuner
...
@@ -32,11 +35,11 @@ _logger = logging.getLogger('nni.tuner.tpe')
 class TpeArguments(NamedTuple):
     """
     These are the hyper-parameters of TPE algorithm itself.
-    To avoid confusing with trials' hyper-parameters, they are called "arguments" in this code.
+    To avoid confusing with trials' hyper-parameters, they are called "arguments" in TPE source code.

     Parameters
-    ==========
-    constant_liar_type : 'best' | 'worst' | 'mean' | None (default: 'best')
+    ----------
+    constant_liar_type
         TPE algorithm itself does not support parallel tuning.
         This parameter specifies how to optimize for trial_concurrency > 1.
...
@@ -44,20 +47,21 @@ class TpeArguments(NamedTuple):
         How each liar works is explained in paper's section 6.1.
         In general "best" suit for small trial number and "worst" suit for large trial number.
+        (:doc:`experiment result </CommunitySharings/ParallelizingTpeSearch>`)

-    n_startup_jobs : int (default: 20)
+    n_startup_jobs
         The first N hyper-parameters are generated fully randomly for warming up.
         If the search space is large, you can increase this value.
         Or if max_trial_number is small, you may want to decrease it.

-    n_ei_candidates : int (default: 24)
+    n_ei_candidates
         For each iteration TPE samples EI for N sets of parameters and choose the best one. (loosely speaking)

-    linear_forgetting : int (default: 25)
+    linear_forgetting
         TPE will lower the weights of old trials.
         This controls how many iterations it takes for a trial to start decay.

-    prior_weight : float (default: 1.0)
+    prior_weight
         TPE treats user provided search space as prior.
         When generating new trials, it also incorporates the prior in trial history by transforming the search space to
         one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range).
...
@@ -66,11 +70,11 @@ class TpeArguments(NamedTuple):
         With prior weight 1.0, the search space is treated as one good trial.
         For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result.

-    gamma : float (default: 0.25)
+    gamma
         Controls how many trials are considered "good".
         The number is calculated as "min(gamma * sqrt(N), linear_forgetting)".
     """

-    constant_liar_type: Optional[str] = 'best'
+    constant_liar_type: Literal['best', 'worst', 'mean'] | None = 'best'
     n_startup_jobs: int = 20
     n_ei_candidates: int = 24
     linear_forgetting: int = 25
...
@@ -79,18 +83,61 @@ class TpeArguments(NamedTuple):
 class TpeTuner(Tuner):
     """
+    Tree-structured Parzen Estimator (TPE) is an SMBO tuner.
+
+    TPE models P(x|y) and P(y) where x represents hyperparameters and y the associated evaluation metric.
+    P(x|y) is modeled by transforming the generative process of hyperparameters,
+    replacing the distributions of the configuration prior with non-parametric densities.
+
+    TPE is described in detail in *Algorithms for Hyper-Parameter Optimization*. (`paper`_)
+
+    .. _paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
+
+    Examples
+    --------
+
+    .. code-block::
+
+        ## minimal config ##
+
+        config.tuner.name = 'TPE'
+        config.tuner.class_args = {
+            'optimize_mode': 'minimize'
+        }
+
+    .. code-block::
+
+        ## advanced config ##
+
+        config.tuner.name = 'TPE'
+        config.tuner.class_args = {
+            'optimize_mode': maximize,
+            'seed': 12345,
+            'tpe_args': {
+                'constant_liar_type': 'mean',
+                'n_startup_jobs': 10,
+                'n_ei_candidates': 20,
+                'linear_forgetting': 100,
+                'prior_weight': 0,
+                'gamma': 0.5
+            }
+        }
+
     Parameters
-    ==========
-    optimze_mode : 'minimize' | 'maximize' (default: 'minimize')
+    ----------
+    optimze_mode
         Whether optimize to minimize or maximize trial result.
-    seed : int | None
+    seed
         The random seed.
-    tpe_args : dict[string, Any] | None
+    tpe_args
         Advanced users can use this to customize TPE tuner.
         See `TpeArguments` for details.
     """

-    def __init__(self, optimize_mode='minimize', seed=None, tpe_args=None):
+    def __init__(self,
+                 optimize_mode: Literal['minimize', 'maximize'] = 'minimize',
+                 seed: int | None = None,
+                 tpe_args: dict[str, Any] | None = None):
         self.optimize_mode = OptimizeMode(optimize_mode)
         self.args = TpeArguments(**(tpe_args or {}))
         self.space = None
...
@@ -183,7 +230,7 @@ def suggest_parameter(args, rng, spec, parameter_history):
 ## Utilities part ##

 class Record(NamedTuple):
-    param: Union[int, float]
+    param: int | float
     loss: float

 class BestLiar:  # assume running parameters have best result, it accelerates "converging"
...
@@ -305,7 +352,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
     this function is used for everything other than "choice" and "randint".

     Parameters
-    ==========
+    ----------
     args: TpeArguments
         Algorithm arguments.
     history_mus: 1-d array of float
...
@@ -317,7 +364,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
         σ value of normal search space.

     Returns
-    =======
+    -------
     Tuple of three 1-d float arrays: (weight, µ, σ).
     The tuple represents N+1 "vicinity of observations" and each one's weight,
...
nni/retiarii/nn/pytorch/hypermodule.py (view file @ b52f7756)
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+from packaging.version import Version
 import torch
 import torch.nn as nn
...
@@ -8,7 +9,6 @@ from nni.retiarii.serializer import basic_unit
 from .api import LayerChoice
 from .utils import generate_new_label
-from ...utils import version_larger_equal

 __all__ = ['AutoActivation']
...
@@ -99,7 +99,7 @@ class UnaryTanh(nn.Module):
     def forward(self, x):
         return torch.tanh(x)

-if not version_larger_equal(torch.__version__, TorchVersion):
+if not Version(torch.__version__) >= Version(TorchVersion):
     @basic_unit
     class UnaryAsinh(nn.Module):
         def forward(self, x):
...
@@ -110,7 +110,7 @@ class UnaryAtan(nn.Module):
     def forward(self, x):
         return torch.atan(x)

-if not version_larger_equal(torch.__version__, TorchVersion):
+if not Version(torch.__version__) >= Version(TorchVersion):
     @basic_unit
     class UnarySinc(nn.Module):
         def forward(self, x):
...
@@ -151,7 +151,7 @@ unary_modules = ['UnaryIdentity', 'UnaryNegative', 'UnaryAbs', 'UnarySquare', 'U
                  'UnarySinh', 'UnaryCosh', 'UnaryTanh', 'UnaryAtan', 'UnaryMax',
                  'UnaryMin', 'UnarySigmoid', 'UnaryLogExp', 'UnaryExpSquare', 'UnaryErf']

-if not version_larger_equal(torch.__version__, TorchVersion):
+if not Version(torch.__version__) >= Version(TorchVersion):
     unary_modules.append('UnaryAsinh')
     unary_modules.append('UnarySinc')
...
nni/retiarii/nn/pytorch/nn.py (view file @ b52f7756)
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+from packaging.version import Version
 import torch
 import torch.nn as nn

 from ...serializer import basic_unit
-from ...utils import version_larger_equal

 # NOTE: support pytorch version >= 1.5.0
...
@@ -31,10 +31,10 @@ __all__ = [
     'Flatten', 'Hardsigmoid'
 ]

-if version_larger_equal(torch.__version__, '1.6.0'):
+if Version(torch.__version__) >= Version('1.6.0'):
     __all__.append('Hardswish')

-if version_larger_equal(torch.__version__, '1.7.0'):
+if Version(torch.__version__) >= Version('1.7.0'):
     __all__.extend(['Unflatten', 'SiLU', 'TripletMarginWithDistanceLoss'])
...
@@ -149,10 +149,10 @@ Transformer = basic_unit(nn.Transformer)
 Flatten = basic_unit(nn.Flatten)
 Hardsigmoid = basic_unit(nn.Hardsigmoid)

-if version_larger_equal(torch.__version__, '1.6.0'):
+if Version(torch.__version__) >= Version('1.6.0'):
     Hardswish = basic_unit(nn.Hardswish)

-if version_larger_equal(torch.__version__, '1.7.0'):
+if Version(torch.__version__) >= Version('1.7.0'):
     SiLU = basic_unit(nn.SiLU)
     Unflatten = basic_unit(nn.Unflatten)
     TripletMarginWithDistanceLoss = basic_unit(nn.TripletMarginWithDistanceLoss)
nni/retiarii/utils.py (view file @ b52f7756)
@@ -18,13 +18,6 @@ def import_(target: str, allow_none: bool = False) -> Any:
     return getattr(module, identifier)

-def version_larger_equal(a: str, b: str) -> bool:
-    # TODO: refactor later
-    a = a.split('+')[0]
-    b = b.split('+')[0]
-    return tuple(map(int, a.split('.'))) >= tuple(map(int, b.split('.')))
-
 _last_uid = defaultdict(int)

 _DEFAULT_MODEL_NAMESPACE = 'model'
...
nni/typehint.py (new file, 0 → 100644, view file @ b52f7756)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import sys
import typing

if typing.TYPE_CHECKING or sys.version_info >= (3, 8):
    Literal = typing.Literal
else:
    Literal = typing.Any