Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
cb6c72ea
"vscode:/vscode.git/clone" did not exist on "df4ad6f4acbe0924af9e722469b5d45b335b9c2e"
Unverified
Commit
cb6c72ea
authored
Nov 25, 2021
by
cruiseliu
Committed by
GitHub
Nov 25, 2021
Browse files
Refactor Hyperopt Tuners (Stage 3) - TPE tuner (#4239)
parent
3601044e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
462 additions
and
10 deletions
+462
-10
nni/algorithms/hpo/random_tuner.py
nni/algorithms/hpo/random_tuner.py
+7
-0
nni/algorithms/hpo/tpe_tuner.py
nni/algorithms/hpo/tpe_tuner.py
+389
-0
nni/common/hpo_utils/formatting.py
nni/common/hpo_utils/formatting.py
+38
-3
nni/runtime/default_config/registered_algorithms.yml
nni/runtime/default_config/registered_algorithms.yml
+3
-0
test/ut/sdk/test_builtin_tuners.py
test/ut/sdk/test_builtin_tuners.py
+5
-5
test/ut/sdk/test_hpo_formatting.py
test/ut/sdk/test_hpo_formatting.py
+20
-2
No files found.
nni/algorithms/hpo/random_tuner.py
View file @
cb6c72ea
...
@@ -9,6 +9,8 @@ You can specify an integer seed to determine random result.
...
@@ -9,6 +9,8 @@ You can specify an integer seed to determine random result.
__all__
=
[
'RandomTuner'
,
'suggest'
,
'suggest_parameter'
]
__all__
=
[
'RandomTuner'
,
'suggest'
,
'suggest_parameter'
]
import
logging
import
numpy
as
np
import
numpy
as
np
import
schema
import
schema
...
@@ -16,10 +18,15 @@ from nni import ClassArgsValidator
...
@@ -16,10 +18,15 @@ from nni import ClassArgsValidator
from
nni.common.hpo_utils
import
format_search_space
,
deformat_parameters
from
nni.common.hpo_utils
import
format_search_space
,
deformat_parameters
from
nni.tuner
import
Tuner
from
nni.tuner
import
Tuner
_logger
=
logging
.
getLogger
(
'nni.tuner.random'
)
class
RandomTuner
(
Tuner
):
class
RandomTuner
(
Tuner
):
def
__init__
(
self
,
seed
=
None
):
def
__init__
(
self
,
seed
=
None
):
self
.
space
=
None
self
.
space
=
None
if
seed
is
None
:
# explicitly generate a seed to make the experiment reproducible
seed
=
np
.
random
.
default_rng
().
integers
(
2
**
31
)
self
.
rng
=
np
.
random
.
default_rng
(
seed
)
self
.
rng
=
np
.
random
.
default_rng
(
seed
)
_logger
.
info
(
f
'Using random seed
{
seed
}
'
)
def
update_search_space
(
self
,
space
):
def
update_search_space
(
self
,
space
):
self
.
space
=
format_search_space
(
space
)
self
.
space
=
format_search_space
(
space
)
...
...
nni/algorithms/hpo/tpe_tuner.py
0 → 100644
View file @
cb6c72ea
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
Tree-structured Parzen Estimator (TPE) tuner for hyper-parameter optimization.
Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py
This is a slightly modified re-implementation of the algorithm.
"""
__all__
=
[
'TpeTuner'
,
'TpeArguments'
,
'suggest'
,
'suggest_parameter'
]
from
collections
import
defaultdict
import
logging
import
math
from
typing
import
NamedTuple
,
Optional
,
Union
import
numpy
as
np
from
scipy.special
import
erf
# pylint: disable=no-name-in-module
from
nni.tuner
import
Tuner
from
nni.common.hpo_utils
import
OptimizeMode
,
format_search_space
,
deformat_parameters
,
format_parameters
from
.
import
random_tuner
_logger
=
logging
.
getLogger
(
'nni.tuner.tpe'
)
## Public API part ##
class TpeArguments(NamedTuple):
    """
    These are the hyper-parameters of TPE algorithm itself.
    To avoid confusing with trials' hyper-parameters, they are called "arguments" in this code.

    Parameters
    ==========
    constant_liar_type: 'best' | 'worst' | 'mean' | None (default: 'best')
        TPE algorithm itself does not support parallel tuning.
        This parameter specifies how to optimize for trial_concurrency > 1.
        None (or "null" in YAML) means do not optimize. This is the default behavior in legacy version.
        How each liar works is explained in paper's section 6.1.
        In general "best" suit for small trial number and "worst" suit for large trial number.
    n_startup_jobs: int (default: 20)
        The first N hyper-parameters are generated fully randomly for warming up.
        If the search space is large, you can increase this value.
        Or if max_trial_number is small, you may want to decrease it.
    n_ei_candidates: int (default: 24)
        For each iteration TPE samples EI for N sets of parameters and choose the best one. (loosely speaking)
    linear_forgetting: int (default: 25)
        TPE will lower the weights of old trials.
        This controls how many iterations it takes for a trial to start decay.
    prior_weight: float (default: 1.0)
        TPE treats user provided search space as prior.
        When generating new trials, it also incorporates the prior in trial history by transforming the search space to
        one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range).
        Here, prior_weight determines the weight of this trial configuration in the history trial configurations.
        With prior weight 1.0, the search space is treated as one good trial.
        For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result.
    gamma: float (default: 0.25)
        Controls how many trials are considered "good".
        The number is calculated as "min(gamma * sqrt(N), linear_forgetting)".
    """
    constant_liar_type: Optional[str] = 'best'
    n_startup_jobs: int = 20
    n_ei_candidates: int = 24
    linear_forgetting: int = 25
    prior_weight: float = 1.0
    gamma: float = 0.25
class TpeTuner(Tuner):
    """
    Tree-structured Parzen Estimator tuner.

    Parameters
    ==========
    optimze_mode: 'minimize' | 'maximize' (default: 'minimize')
        Whether optimize to minimize or maximize trial result.
    seed: int | None
        The random seed.
    tpe_args: dict[string, Any] | None
        Advanced users can use this to customize TPE tuner.
        See `TpeArguments` for details.
    """

    def __init__(self, optimize_mode='minimize', seed=None, tpe_args=None):
        self.optimize_mode = OptimizeMode(optimize_mode)
        self.args = TpeArguments(**(tpe_args or {}))
        self.space = None
        # concurrent generate_parameters() calls are likely to yield similar result, because they use same history
        # the liar solves this problem by adding fake results to history
        self.liar = create_liar(self.args.constant_liar_type)

        if seed is None:
            # explicitly generate a seed to make the experiment reproducible
            seed = np.random.default_rng().integers(2 ** 31)
        self.rng = np.random.default_rng(seed)
        _logger.info(f'Using random seed {seed}')

        self._params = {}
        # parameter_id -> parameters (in internal format)
        self._running_params = {}
        # subset of above, that has been submitted but has not yet received loss
        self._history = defaultdict(list)
        # parameter key -> list of Record

    def update_search_space(self, space):
        # convert user-format search space to internal format once, up front
        self.space = format_search_space(space)

    def generate_parameters(self, parameter_id, **kwargs):
        """Generate one set of parameters, optionally padding history with fake losses."""
        if self.liar and self._running_params:
            # give a fake loss for each concurrently running parameter set
            history = {key: records.copy() for key, records in self._history.items()}
            # copy history so the fake records never leak into the real one
            lie = self.liar.lie()
            for param in self._running_params.values():
                for key, value in param.items():
                    history[key].append(Record(value, lie))
        else:
            history = self._history

        params = suggest(self.args, self.rng, self.space, history)
        self._params[parameter_id] = params
        self._running_params[parameter_id] = params
        return deformat_parameters(params, self.space)

    def receive_trial_result(self, parameter_id, _parameters, loss, **kwargs):
        """Record a finished trial's loss; TPE always minimizes internally."""
        if self.optimize_mode is OptimizeMode.Maximize:
            loss = -loss
        if self.liar:
            self.liar.update(loss)
        params = self._running_params.pop(parameter_id)
        for key, value in params.items():
            self._history[key].append(Record(value, loss))

    def trial_end(self, parameter_id, _success, **kwargs):
        # also cleans up trials that ended without reporting a result
        self._running_params.pop(parameter_id, None)

    def import_data(self, data):
        # for resuming experiment
        for trial in data:
            param = format_parameters(trial['parameter'], self.space)
            loss = trial['value']
            if self.optimize_mode is OptimizeMode.Maximize:
                loss = -trial['value']
            for key, value in param.items():
                self._history[key].append(Record(value, loss))
        _logger.info(f'Replayed {len(data)} trials')
def suggest(args, rng, space, history):
    """Suggest one full parameter set (internal format) for the given search space and history."""
    params = {}
    for key, spec in space.items():
        if spec.is_activated_in(params):
            # nested search space is chosen
            params[key] = suggest_parameter(args, rng, spec, history[key])
    return params
def suggest_parameter(args, rng, spec, parameter_history):
    """Suggest one parameter value, falling back to random sampling during warm-up."""
    if len(parameter_history) < args.n_startup_jobs:
        # not enough history, still warming up
        return random_tuner.suggest_parameter(rng, spec)

    if spec.categorical:
        return suggest_categorical(args, rng, parameter_history, spec.size)

    if spec.normal_distributed:
        mu = spec.mu
        sigma = spec.sigma
        clip = None
    else:
        # TPE does not support uniform distribution natively
        # they are converted to normal((low + high) / 2, high - low)
        mu = (spec.low + spec.high) * 0.5
        sigma = spec.high - spec.low
        clip = (spec.low, spec.high)

    return suggest_normal(args, rng, parameter_history, mu, sigma, clip)
## Public API part end ##
## Utilities part ##
class Record(NamedTuple):
    """One history entry: a parameter value (internal format) and the loss it yielded."""
    param: Union[int, float]
    loss: float
class BestLiar:
    """Constant liar that pretends every running trial achieves the best loss seen so far.

    Assuming running parameters have the best result accelerates "converging".
    """

    def __init__(self):
        # Lowest loss observed so far; None until the first real result arrives.
        self._best = None

    def update(self, loss):
        """Record a real loss, keeping it if it beats the current best."""
        if self._best is None:
            self._best = loss
        else:
            self._best = min(self._best, loss)

    def lie(self):
        """Return the fake loss assigned to running trials."""
        if self._best is None:
            # when there is no real result, all of history is the same lie, so the value does not matter
            # return 0 instead of infinity to prevent potential calculation error
            return 0.0
        return self._best
class WorstLiar:
    """Constant liar that pretends every running trial gets the worst loss seen so far.

    Assuming running parameters have the worst result helps to jump out of local minimum.
    """

    def __init__(self):
        # Highest loss observed so far; None until the first real result arrives.
        self._worst = None

    def update(self, loss):
        """Record a real loss, keeping it if it is worse than the current worst."""
        if self._worst is None:
            self._worst = loss
        else:
            self._worst = max(self._worst, loss)

    def lie(self):
        """Return the fake loss assigned to running trials (0 until a real result exists)."""
        if self._worst is None:
            return 0.0
        return self._worst
class MeanLiar:
    """Constant liar that pretends every running trial scores the average of real results."""

    def __init__(self):
        self._sum = 0.0  # running total of observed losses
        self._n = 0      # number of observed losses

    def update(self, loss):
        """Fold one real loss into the running average."""
        self._sum = self._sum + loss
        self._n = self._n + 1

    def lie(self):
        """Return the mean of observed losses, or 0 when none have been seen."""
        return self._sum / self._n if self._n else 0.0
def create_liar(liar_type):
    """Create the constant-liar instance named by ``liar_type``.

    Parameters
    ==========
    liar_type: str | None
        'best', 'worst', 'mean' (case-insensitive), or None / 'none' to disable.

    Returns
    =======
    A liar instance, or None when lying is disabled.

    Raises
    ======
    KeyError
        If ``liar_type`` is not one of the recognized names.
    """
    # BUG FIX: the original compared the bound method object `liar_type.lower`
    # to the string 'none' (always False), so liar_type='none' fell through to
    # the dict lookup and raised KeyError. The method must be *called*.
    if liar_type is None or liar_type.lower() == 'none':
        return None
    liar_classes = {
        'best': BestLiar,
        'worst': WorstLiar,
        'mean': MeanLiar,
    }
    return liar_classes[liar_type.lower()]()
## Utilities part end ##
## Algorithm part ##
# the algorithm is implemented in process-oriented style because I find it's easier to be understood in this way,
# you know exactly what data each step is processing.
def suggest_categorical(args, rng, param_history, size):
    """
    Suggest a categorical ("choice" or "randint") parameter.

    ``size`` is the number of candidate values; history entries are candidate indices.
    """
    below, above = split_history(args, param_history)
    # split history into good ones and bad ones

    weights = linear_forgetting_weights(args, len(below))
    counts = np.bincount(below, weights, size)
    p = (counts + args.prior_weight) / sum(counts + args.prior_weight)
    # calculate weight of good choices
    samples = rng.choice(size, args.n_ei_candidates, p=p)
    # sample N EIs using the weights
    below_llik = np.log(p[samples])
    # the probability of these samples to be good (llik means log-likelihood)

    weights = linear_forgetting_weights(args, len(above))
    counts = np.bincount(above, weights, size)
    p = (counts + args.prior_weight) / sum(counts + args.prior_weight)
    # calculate weight of bad choices
    above_llik = np.log(p[samples])
    # the probability of above samples to be bad

    return samples[np.argmax(below_llik - above_llik)]
    # which one has best probability to be good
def suggest_normal(args, rng, param_history, prior_mu, prior_sigma, clip):
    """
    Suggest a normal distributed parameter.
    Uniform has been converted to normal in the caller function; log and q will be handled by "deformat_parameters".
    """
    below, above = split_history(args, param_history)
    # split history into good ones and bad ones

    weights, mus, sigmas = adaptive_parzen_normal(args, below, prior_mu, prior_sigma)
    # calculate weight of good segments
    samples = gmm1(args, rng, weights, mus, sigmas, clip)
    # sample N EIs using the weights
    below_llik = gmm1_lpdf(args, samples, weights, mus, sigmas, clip)
    # the probability of these samples to be good

    weights, mus, sigmas = adaptive_parzen_normal(args, above, prior_mu, prior_sigma)
    # calculate weight of bad segments
    above_llik = gmm1_lpdf(args, samples, weights, mus, sigmas, clip)
    # the probability of above samples to be bad

    return samples[np.argmax(below_llik - above_llik)]
    # which one has best probability to be good
def split_history(args, param_history):
    """Divide trials into good ones ("below") and bad ones ("above").

    The best ``min(ceil(gamma * sqrt(N)), linear_forgetting)`` trials by loss are
    "good"; the rest are "bad". Returns two 1-d numpy arrays of parameter values,
    each ordered by ascending loss.
    """
    total = len(param_history)
    n_good = min(math.ceil(args.gamma * math.sqrt(total)), args.linear_forgetting)
    # stable sort by loss (equivalent to argsort + indexing)
    ranked = sorted(param_history, key=lambda record: record.loss)
    good = [record.param for record in ranked[:n_good]]
    bad = [record.param for record in ranked[n_good:]]
    return np.asarray(good), np.asarray(bad)
def linear_forgetting_weights(args, n):
    """Calculate decayed weights of N chronological trials.

    The newest ``linear_forgetting`` trials keep full weight 1.0; any older
    trials get weights ramping linearly from 1/n up to 1.0.
    """
    keep = args.linear_forgetting
    if n < keep:
        # few enough trials: no decay at all
        return np.ones(n)
    decayed = np.linspace(1.0 / n, 1.0, n - keep)
    recent = np.ones(keep)
    return np.concatenate([decayed, recent])
def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
    """
    The "Adaptive Parzen Estimator" described in paper section 4.2, for normal distribution.

    Because TPE internally only supports categorical and normal distributed space (domain),
    this function is used for everything other than "choice" and "randint".

    Parameters
    ==========
    args: TpeArguments
        Algorithm arguments.
    history_mus: 1-d array of float
        Parameter values evaluated in history.
        These are the "observations" in paper section 4.2. ("placing density in the vicinity of K observations")
    prior_mu: float
        µ value of normal search space.
    piror_sigma: float
        σ value of normal search space.

    Returns
    =======
    Tuple of three 1-d float arrays: (weight, µ, σ).
    The tuple represents N+1 "vicinity of observations" and each one's weight,
    calculated from "N" history and "1" user provided prior.
    The result is sorted by µ.
    """
    mus = np.append(history_mus, prior_mu)
    order = np.argsort(mus)
    mus = mus[order]
    prior_index = np.searchsorted(mus, prior_mu)

    if len(mus) == 1:
        sigmas = np.asarray([prior_sigma])
    elif len(mus) == 2:
        sigmas = np.asarray([prior_sigma * 0.5, prior_sigma * 0.5])
        sigmas[prior_index] = prior_sigma
    else:
        # each observation's sigma is the larger gap to its nearest neighbors;
        # the endpoints use their single neighbor's gap
        l_delta = mus[1:-1] - mus[:-2]
        r_delta = mus[2:] - mus[1:-1]
        sigmas_mid = np.maximum(l_delta, r_delta)
        sigmas = np.concatenate([[mus[1] - mus[0]], sigmas_mid, [mus[-1] - mus[-2]]])
        sigmas[prior_index] = prior_sigma

    # "magic formula" in official implementation
    n = min(100, len(mus) + 1)
    sigmas = np.clip(sigmas, prior_sigma / n, prior_sigma)

    # NOTE(review): `mus` already includes the prior here, so this builds
    # len(mus)+1 weights and the appended prior_weight is never selected by
    # `weights[order]` (order has only len(mus) indices) — confirm against
    # the official hyperopt implementation whether this is intended.
    weights = np.append(linear_forgetting_weights(args, len(mus)), args.prior_weight)
    weights = weights[order]

    return weights / np.sum(weights), mus, sigmas
def gmm1(args, rng, weights, mus, sigmas, clip=None):
    """Draw ``n_ei_candidates`` samples from a 1-D Gaussian mixture model.

    Each sample picks a mixture component according to ``weights`` and then
    draws from that component's normal distribution. When ``clip`` is given,
    out-of-range samples are rejected and redrawn until enough remain.
    """
    collected = np.asarray([])
    needed = args.n_ei_candidates
    while len(collected) < needed:
        remaining = needed - len(collected)
        # choose one component per sample via one-hot multinomial rows
        component = np.argmax(rng.multinomial(1, weights, remaining), axis=1)
        batch = rng.normal(mus[component], sigmas[component])
        if clip:
            in_range = (clip[0] <= batch) & (batch <= clip[1])
            batch = batch[in_range]
        collected = np.concatenate([collected, batch])
    return collected
def gmm1_lpdf(_args, samples, weights, mus, sigmas, clip=None):
    """Log probability density of ``samples`` under a 1-D Gaussian mixture.

    When ``clip`` is given, densities are renormalized by the mixture's
    probability mass inside the clip range (truncated-mixture correction).
    Returns a 1-d array with one log-density per sample, computed with the
    log-sum-exp trick for numerical stability.
    """
    eps = 1e-12
    if clip:
        # mixture probability mass inside [clip[0], clip[1]]
        denom = np.maximum(np.sqrt(2) * sigmas, eps)
        cdf_at_low = erf((clip[0] - mus) / denom) * 0.5 + 0.5
        cdf_at_high = erf((clip[1] - mus) / denom) * 0.5 + 0.5
        p_accept = np.sum(weights * (cdf_at_high - cdf_at_low))
    else:
        p_accept = 1

    # per-component normal log pdf, shape (n_samples, n_components)
    offsets = samples.reshape(-1, 1) - mus
    mahalanobis = (offsets / np.maximum(sigmas, eps)) ** 2
    normalizer = np.sqrt(2 * np.pi) * sigmas
    component_lpdf = -0.5 * mahalanobis + np.log(weights / normalizer / p_accept)

    # log-sum-exp across components (rows)
    row_max = component_lpdf.max(axis=1)
    shifted = np.exp(component_lpdf - row_max.reshape(-1, 1))
    return np.log(shifted.sum(axis=1)) + row_max
## Algorithm part end ##
nni/common/hpo_utils/formatting.py
View file @
cb6c72ea
...
@@ -14,6 +14,7 @@ You should check its code before reading docstrings in this file.
...
@@ -14,6 +14,7 @@ You should check its code before reading docstrings in this file.
__all__
=
[
__all__
=
[
'ParameterSpec'
,
'ParameterSpec'
,
'deformat_parameters'
,
'deformat_parameters'
,
'format_parameters'
,
'format_search_space'
,
'format_search_space'
,
]
]
...
@@ -21,6 +22,8 @@ import math
...
@@ -21,6 +22,8 @@ import math
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
typing
import
Any
,
List
,
NamedTuple
,
Optional
,
Tuple
from
typing
import
Any
,
List
,
NamedTuple
,
Optional
,
Tuple
import
numpy
as
np
class
ParameterSpec
(
NamedTuple
):
class
ParameterSpec
(
NamedTuple
):
"""
"""
Specification (aka space / range / domain) of one single parameter.
Specification (aka space / range / domain) of one single parameter.
...
@@ -73,7 +76,7 @@ def format_search_space(search_space):
...
@@ -73,7 +76,7 @@ def format_search_space(search_space):
# Remove these comments when we drop 3.6 support.
# Remove these comments when we drop 3.6 support.
return
{
spec
.
key
:
spec
for
spec
in
formatted
}
return
{
spec
.
key
:
spec
for
spec
in
formatted
}
def
deformat_parameters
(
parameters
,
formatted_search_space
):
def
deformat_parameters
(
formatted_
parameters
,
formatted_search_space
):
"""
"""
Convert internal format parameters to users' expected format.
Convert internal format parameters to users' expected format.
...
@@ -86,12 +89,12 @@ def deformat_parameters(parameters, formatted_search_space):
...
@@ -86,12 +89,12 @@ def deformat_parameters(parameters, formatted_search_space):
4. For nested choices, convert flatten key-value pairs into nested structure.
4. For nested choices, convert flatten key-value pairs into nested structure.
"""
"""
ret
=
{}
ret
=
{}
for
key
,
x
in
parameters
.
items
():
for
key
,
x
in
formatted_
parameters
.
items
():
spec
=
formatted_search_space
[
key
]
spec
=
formatted_search_space
[
key
]
if
spec
.
categorical
:
if
spec
.
categorical
:
if
spec
.
type
==
'randint'
:
if
spec
.
type
==
'randint'
:
lower
=
min
(
math
.
ceil
(
float
(
x
))
for
x
in
spec
.
values
)
lower
=
min
(
math
.
ceil
(
float
(
x
))
for
x
in
spec
.
values
)
_assign
(
ret
,
key
,
lower
+
x
)
_assign
(
ret
,
key
,
int
(
lower
+
x
)
)
elif
_is_nested_choices
(
spec
.
values
):
elif
_is_nested_choices
(
spec
.
values
):
_assign
(
ret
,
tuple
([
*
key
,
'_name'
]),
spec
.
values
[
x
][
'_name'
])
_assign
(
ret
,
tuple
([
*
key
,
'_name'
]),
spec
.
values
[
x
][
'_name'
])
else
:
else
:
...
@@ -104,9 +107,41 @@ def deformat_parameters(parameters, formatted_search_space):
...
@@ -104,9 +107,41 @@ def deformat_parameters(parameters, formatted_search_space):
if
spec
.
clip
:
if
spec
.
clip
:
x
=
max
(
x
,
spec
.
clip
[
0
])
x
=
max
(
x
,
spec
.
clip
[
0
])
x
=
min
(
x
,
spec
.
clip
[
1
])
x
=
min
(
x
,
spec
.
clip
[
1
])
if
isinstance
(
x
,
np
.
number
):
x
=
x
.
item
()
_assign
(
ret
,
key
,
x
)
_assign
(
ret
,
key
,
x
)
return
ret
return
ret
def format_parameters(parameters, formatted_search_space):
    """
    Convert end users' parameter format back to internal format, mainly for resuming experiments.
    The result is not accurate for "q*" and for "choice" that have duplicate candidates.
    """
    # I don't like this function. It's better to use checkpoint for resuming.
    ret = {}
    for key, spec in formatted_search_space.items():
        if not spec.is_activated_in(ret):
            # this spec belongs to a nested choice branch that was not taken
            continue
        # walk the nested user dict following the string parts of the tuple key
        value = parameters
        for name in key:
            if isinstance(name, str):
                value = value[name]
        if spec.categorical:
            if spec.type == 'randint':
                # internal format stores the offset from the lower bound
                lower = min(math.ceil(float(x)) for x in spec.values)
                ret[key] = value - lower
            elif _is_nested_choices(spec.values):
                # nested choices are identified by their '_name' field
                names = [nested['_name'] for nested in spec.values]
                ret[key] = names.index(value['_name'])
            else:
                # plain choice: internal format stores the candidate index
                ret[key] = spec.values.index(value)
        else:
            if spec.log_distributed:
                # internal format keeps log-space values
                value = math.log(value)
            ret[key] = value
    return ret
def
_format_search_space
(
parent_key
,
space
):
def
_format_search_space
(
parent_key
,
space
):
formatted
=
[]
formatted
=
[]
for
name
,
spec
in
space
.
items
():
for
name
,
spec
in
space
.
items
():
...
...
nni/runtime/default_config/registered_algorithms.yml
View file @
cb6c72ea
...
@@ -26,6 +26,9 @@ tuners:
...
@@ -26,6 +26,9 @@ tuners:
className
:
nni.algorithms.hpo.smac_tuner.SMACTuner
className
:
nni.algorithms.hpo.smac_tuner.SMACTuner
source
:
nni
source
:
nni
-
builtinName
:
TPE
-
builtinName
:
TPE
className
:
nni.algorithms.hpo.tpe_tuner.TpeTuner
source
:
nni
-
builtinName
:
TPE_legacy
classArgs
:
classArgs
:
algorithm_name
:
tpe
algorithm_name
:
tpe
classArgsValidator
:
nni.algorithms.hpo.hyperopt_tuner.HyperoptClassArgsValidator
classArgsValidator
:
nni.algorithms.hpo.hyperopt_tuner.HyperoptClassArgsValidator
...
...
test/ut/sdk/test_builtin_tuners.py
View file @
cb6c72ea
...
@@ -19,7 +19,9 @@ from nni.algorithms.hpo.gridsearch_tuner import GridSearchTuner
...
@@ -19,7 +19,9 @@ from nni.algorithms.hpo.gridsearch_tuner import GridSearchTuner
from
nni.algorithms.hpo.hyperopt_tuner
import
HyperoptTuner
from
nni.algorithms.hpo.hyperopt_tuner
import
HyperoptTuner
from
nni.algorithms.hpo.metis_tuner
import
MetisTuner
from
nni.algorithms.hpo.metis_tuner
import
MetisTuner
from
nni.algorithms.hpo.pbt_tuner
import
PBTTuner
from
nni.algorithms.hpo.pbt_tuner
import
PBTTuner
from
nni.algorithms.hpo.random_tuner
import
RandomTuner
from
nni.algorithms.hpo.regularized_evolution_tuner
import
RegularizedEvolutionTuner
from
nni.algorithms.hpo.regularized_evolution_tuner
import
RegularizedEvolutionTuner
from
nni.algorithms.hpo.tpe_tuner
import
TpeTuner
from
nni.runtime.msg_dispatcher
import
_pack_parameter
,
MsgDispatcher
from
nni.runtime.msg_dispatcher
import
_pack_parameter
,
MsgDispatcher
smac_imported
=
False
smac_imported
=
False
...
@@ -319,14 +321,12 @@ class BuiltinTunersTestCase(TestCase):
...
@@ -319,14 +321,12 @@ class BuiltinTunersTestCase(TestCase):
self
.
import_data_test
(
tuner_fn
)
self
.
import_data_test
(
tuner_fn
)
def
test_tpe
(
self
):
def
test_tpe
(
self
):
tuner_fn
=
lambda
:
HyperoptTuner
(
"tpe"
)
tuner_fn
=
TpeTuner
self
.
search_space_test_all
(
tuner_fn
,
self
.
search_space_test_all
(
TpeTuner
)
ignore_types
=
[
"uniform_equal"
,
"qloguniform_equal"
,
"loguniform_equal"
,
"quniform_clip_2"
])
# NOTE: types are ignored because `tpe.py line 465, in adaptive_parzen_normal assert prior_sigma > 0`
self
.
import_data_test
(
tuner_fn
)
self
.
import_data_test
(
tuner_fn
)
def
test_random_search
(
self
):
def
test_random_search
(
self
):
tuner_fn
=
lambda
:
HyperoptTuner
(
"random_search"
)
tuner_fn
=
RandomTuner
self
.
search_space_test_all
(
tuner_fn
)
self
.
search_space_test_all
(
tuner_fn
)
self
.
import_data_test
(
tuner_fn
)
self
.
import_data_test
(
tuner_fn
)
...
...
test/ut/sdk/test_hpo_formatting.py
View file @
cb6c72ea
from
math
import
exp
,
log
from
math
import
exp
,
log
from
nni.common.hpo_utils
import
deformat_parameters
,
format_search_space
from
nni.common.hpo_utils
import
deformat_parameters
,
format_parameters
,
format_search_space
user_space
=
{
user_space
=
{
'pool'
:
{
'_type'
:
'choice'
,
'_value'
:
[
'max'
,
'min'
,
'avg'
]
},
'pool'
:
{
'_type'
:
'choice'
,
'_value'
:
[
'max'
,
'min'
,
'avg'
]
},
...
@@ -35,7 +35,6 @@ user_space = {
...
@@ -35,7 +35,6 @@ user_space = {
},
},
}
}
spec_names
=
[
'pool'
,
'kernel'
,
'D'
,
'dropout'
,
'hidden'
,
'U_lr'
,
'U_batch'
,
'dropout'
,
'hidden'
,
'N_lr'
,
'N_batch'
,
'not_nested'
]
spec_names
=
[
'pool'
,
'kernel'
,
'D'
,
'dropout'
,
'hidden'
,
'U_lr'
,
'U_batch'
,
'dropout'
,
'hidden'
,
'N_lr'
,
'N_batch'
,
'not_nested'
]
spec_types
=
[
'choice'
,
'randint'
,
'choice'
,
'uniform'
,
'quniform'
,
'loguniform'
,
'qloguniform'
,
'normal'
,
'qnormal'
,
'lognormal'
,
'qlognormal'
,
'choice'
]
spec_types
=
[
'choice'
,
'randint'
,
'choice'
,
'uniform'
,
'quniform'
,
'loguniform'
,
'qloguniform'
,
'normal'
,
'qnormal'
,
'lognormal'
,
'qlognormal'
,
'choice'
]
spec_values
=
[[
'max'
,
'min'
,
'avg'
],
[
2
,
8
],
user_space
[
'D'
][
'_value'
],
[
0.5
,
0.9
],
[
100.0
,
1000.0
,
3.0
],
[
0.0001
,
0.1
],
[
16.0
,
128.0
,
0.725
],
[
0.7
,
0.2
],
[
500.0
,
200.0
,
3.0
],
[
-
6.0
,
3.0
],
[
3.5
,
1.2
,
0.725
],
[{
'x'
:
0
,
'y'
:
0
},{
'x'
:
1
,
'y'
:
2
}]]
spec_values
=
[[
'max'
,
'min'
,
'avg'
],
[
2
,
8
],
user_space
[
'D'
][
'_value'
],
[
0.5
,
0.9
],
[
100.0
,
1000.0
,
3.0
],
[
0.0001
,
0.1
],
[
16.0
,
128.0
,
0.725
],
[
0.7
,
0.2
],
[
500.0
,
200.0
,
3.0
],
[
-
6.0
,
3.0
],
[
3.5
,
1.2
,
0.725
],
[{
'x'
:
0
,
'y'
:
0
},{
'x'
:
1
,
'y'
:
2
}]]
...
@@ -95,6 +94,11 @@ user_params_1 = {
...
@@ -95,6 +94,11 @@ user_params_1 = {
'not_nested'
:
{
'x'
:
0
,
'y'
:
0
},
'not_nested'
:
{
'x'
:
0
,
'y'
:
0
},
}
}
resume_params_1
=
dict
(
internal_params_1
)
resume_params_1
[(
'D'
,
0
,
'hidden'
)]
=
100.0
resume_params_1
[(
'D'
,
0
,
'U_lr'
)]
=
log
(
exp
(
-
4.6
))
resume_params_1
[(
'D'
,
0
,
'U_batch'
)]
=
log
(
54.375
)
internal_params_2
=
{
internal_params_2
=
{
(
'pool'
,):
2
,
(
'pool'
,):
2
,
(
'kernel'
,):
0
,
(
'kernel'
,):
0
,
...
@@ -119,6 +123,11 @@ user_params_2 = {
...
@@ -119,6 +123,11 @@ user_params_2 = {
'not_nested'
:
{
'x'
:
1
,
'y'
:
2
},
'not_nested'
:
{
'x'
:
1
,
'y'
:
2
},
}
}
resume_params_2
=
dict
(
internal_params_2
)
resume_params_2
[(
'D'
,
1
,
'hidden'
)]
=
99.0
resume_params_2
[(
'D'
,
1
,
'N_lr'
)]
=
log
(
exp
(
-
4.6
))
resume_params_2
[(
'D'
,
1
,
'N_batch'
)]
=
log
(
54.375
)
internal_params_3
=
{
internal_params_3
=
{
(
'pool'
,):
1
,
(
'pool'
,):
1
,
(
'kernel'
,):
1
,
(
'kernel'
,):
1
,
...
@@ -135,12 +144,20 @@ user_params_3 = {
...
@@ -135,12 +144,20 @@ user_params_3 = {
'not_nested'
:
{
'x'
:
1
,
'y'
:
2
},
'not_nested'
:
{
'x'
:
1
,
'y'
:
2
},
}
}
resume_params_3
=
dict
(
internal_params_3
)
def
test_deformatting
():
def
test_deformatting
():
internal_space
=
format_search_space
(
user_space
)
internal_space
=
format_search_space
(
user_space
)
assert
deformat_parameters
(
internal_params_1
,
internal_space
)
==
user_params_1
assert
deformat_parameters
(
internal_params_1
,
internal_space
)
==
user_params_1
assert
deformat_parameters
(
internal_params_2
,
internal_space
)
==
user_params_2
assert
deformat_parameters
(
internal_params_2
,
internal_space
)
==
user_params_2
assert
deformat_parameters
(
internal_params_3
,
internal_space
)
==
user_params_3
assert
deformat_parameters
(
internal_params_3
,
internal_space
)
==
user_params_3
def test_resuming():
    # round-trip check: converting user-format parameters back to internal
    # format must reproduce the expected internal representations
    internal_space = format_search_space(user_space)
    assert format_parameters(user_params_1, internal_space) == resume_params_1
    assert format_parameters(user_params_2, internal_space) == resume_params_2
    assert format_parameters(user_params_3, internal_space) == resume_params_3
def
test_activate
():
def
test_activate
():
internal_space
=
format_search_space
(
user_space
)
internal_space
=
format_search_space
(
user_space
)
...
@@ -165,4 +182,5 @@ def test_activate():
...
@@ -165,4 +182,5 @@ def test_activate():
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_formatting
()
test_formatting
()
test_deformatting
()
test_deformatting
()
test_resuming
()
test_activate
()
test_activate
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment