Unverified commit 51d261e7, authored by J-shang, committed by GitHub

Merge pull request #4668 from microsoft/doc-refactor

parents d63a2ea3 b469e1c1
@@ -124,7 +124,21 @@ class WeightScoreTrainerBasedDataCollector(TrainerBasedDataCollector):
class MovementPruner(BasicPruner):
r"""
Movement pruner is an implementation of movement pruning.
This is a "fine-pruning" algorithm, which means the masks may change during each fine-tuning step.
Each weight element will be scored by the opposite of the sum of the product of weight and its gradient during each step.
This means the weight elements moving towards zero will accumulate negative scores, the weight elements moving away from zero will accumulate positive scores.
The weight elements with low scores will be masked during inference.
The following figure from the paper shows the weight pruning by movement pruning.
.. image:: ../../img/movement_pruning.png
:target: ../../img/movement_pruning.png
:alt:
For more details, please refer to `Movement Pruning: Adaptive Sparsity by Fine-Tuning <https://arxiv.org/abs/2005.07683>`__.
Parameters
----------
model : torch.nn.Module
@@ -158,7 +172,7 @@ class MovementPruner(BasicPruner):
model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance whose optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_epochs : int
@@ -171,6 +185,21 @@ class MovementPruner(BasicPruner):
The number of steps at which sparsity stops growing; note that sparsity stopping growth doesn't mean the masks stop changing.
The sparsity after each `optimizer.step()` is:
total_sparsity * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3).
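A minimal sketch of this cubic schedule (the clamping to 0 before ``warm_up_step`` and to ``total_sparsity`` after ``cool_down_beginning_step`` is an assumption inferred from the descriptions above, not a quote of NNI's implementation):

.. code-block:: python

    def movement_sparsity(step: int, total_sparsity: float,
                          warm_up_step: int, cool_down_beginning_step: int) -> float:
        # Cubic ramp from 0 to total_sparsity between warm-up and cool-down.
        if step <= warm_up_step:
            return 0.0
        if step >= cool_down_beginning_step:
            return total_sparsity
        progress = (step - warm_up_step) / (cool_down_beginning_step - warm_up_step)
        return total_sparsity * (1 - (1 - progress) ** 3)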
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import MovementPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = MovementPruner(model, config_list, trainer, traced_optimizer, criterion, 10, 3000, 27000)
>>> masked_model, masks = pruner.compress()
For a detailed example, please refer to :githublink:`examples/model_compress/pruning/v2/movement_pruning_glue.py <examples/model_compress/pruning/v2/movement_pruning_glue.py>`
"""
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
traced_optimizer: Traceable, criterion: Callable[[Tensor, Tensor], Tensor], training_epochs: int, warm_up_step: int,
...
@@ -22,15 +22,14 @@ _logger = logging.getLogger(__name__)
class DataCollector:
"""
An abstract class for collecting the data needed by the compressor.
Parameters
----------
compressor
The compressor bound to this DataCollector.
"""
def __init__(self, compressor: Compressor):
"""
Parameters
----------
compressor
The compressor binded with this DataCollector.
"""
self.compressor = compressor
def reset(self):
@@ -242,42 +241,43 @@ class TrainerBasedDataCollector(DataCollector):
class MetricsCalculator:
"""
An abstract class for calculating a kind of metric of the given data.
Parameters
----------
dim
The dimensions corresponding to the under-pruning weight dimensions in the collected data.
None means one-to-one correspondence between pruned dimensions and data, which equals setting `dim` as all data dimensions.
Only these `dim` will be kept and other dimensions of the data will be reduced.
Example:
If you want to prune the Conv2d weight at filter level, and the weight size is (32, 16, 3, 3) [out-channel, in-channel, kernel-size-1, kernel-size-2],
then the under-pruning dimension is [0], which means you want to prune the filter or out-channel.
Case 1: Directly collect the conv module weight as data to calculate the metric.
Then the data has size (32, 16, 3, 3).
Note that dimension 0 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=0` will be set in `__init__`.
Case 2: Use the output of the conv module as data to calculate the metric.
Then the data has size (batch_num, 32, feature_map_size_1, feature_map_size_2).
Note that dimension 1 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=1` will be set in `__init__`.
In both of these two cases, the metric of this module has size (32,).
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The under-pruning weight size is (768, 768), and you want to apply block sparsity on dim=[0] with block size [64, 768],
then you can set block_sparse_size=[64]. The final metric size is (12,).
"""
def __init__(self, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None):
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
if self.block_sparse_size is not None:
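To make the ``dim`` semantics above concrete, here is a small illustrative sketch (not NNI code; the L1-norm metric is an arbitrary stand-in):

.. code-block:: python

    import torch

    data = torch.randn(32, 16, 3, 3)   # Conv2d weight, pruning dim is [0]
    keep_dims = [0]
    reduce_dims = [d for d in range(data.dim()) if d not in keep_dims]
    metric = data.abs().sum(dim=reduce_dims)   # size (32,), one value per filter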
@@ -307,36 +307,35 @@ class MetricsCalculator:
class SparsityAllocator:
"""
An abstract class for allocating masks based on metrics.
Parameters
----------
pruner
The pruner bound to this `SparsityAllocator`.
dim
The under-pruning weight dimensions; the metric size should equal the under-pruning weight size on these dimensions.
None means one-to-one correspondence between pruned dimensions and metric, which equals setting `dim` as all under-pruning weight dimensions.
The mask will expand to the weight size depending on `dim`.
Example:
The under-pruning weight has size (2, 3, 4), and `dim=1` means the under-pruning weight dimension is 1.
Then the metric should have size (3,), e.g., `metric=[0.9, 0.1, 0.8]`.
Assuming some kind of `SparsityAllocator` gets the mask on weight dimension 1 as `mask=[1, 0, 1]`,
then the dimension mask will expand to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The metric size is (12,), and block_sparse_size=[64], then the mask will first expand to (768,) before expanding with `dim`.
continuous_mask
Inherit the mask already in the wrapper if set True.
"""
def __init__(self, pruner: Compressor, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None, continuous_mask: bool = True):
"""
Parameters
----------
pruner
The pruner that binded with this `SparsityAllocator`.
dim
The under pruning weight dimensions, which metric size should equal to the under pruning weight size on these dimensions.
None means one-to-one correspondence between pruned dimensions and metric, which equal to set `dim` as all under pruning weight dimensions.
The mask will expand to the weight size depend on `dim`.
Example:
The under pruning weight has size (2, 3, 4), and `dim=1` means the under pruning weight dimension is 1.
Then the metric should have a size (3,), i.e., `metric=[0.9, 0.1, 0.8]`.
Assuming by some kind of `SparsityAllocator` get the mask on weight dimension 1 `mask=[1, 0, 1]`,
then the dimension mask will expand to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This used to describe the block size a metric value represented. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and the block_sparse_size dimension position is corresponding to dim.
Example:
The metric size is (12,), and block_sparse_size=[64], then the mask will expand to (768,) at first before expand with `dim`.
continuous_mask
Inherit the mask already in the wrapper if set True.
"""
self.pruner = pruner
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
...
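The mask expansion described in `SparsityAllocator` can be sketched in a few lines of plain PyTorch (illustrative only, not NNI's actual implementation):

.. code-block:: python

    import torch

    weight = torch.randn(2, 3, 4)                      # under-pruning weight, dim=1
    mask_1d = torch.tensor([1.0, 0.0, 1.0])            # mask on weight dimension 1
    mask = mask_1d.reshape(1, 3, 1).expand_as(weight)  # broadcast to the full weight size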
@@ -200,6 +200,17 @@ def compute_sparsity(origin_model: Module, compact_model: Module, compact_model_
The compact model is the origin model after pruning,
and it may have a different structure from origin_model because of speedup.
Parameters
----------
origin_model : torch.nn.Module
The original un-pruned model.
compact_model : torch.nn.Module
The model after speedup, or the original model.
compact_model_masks: Dict[str, Dict[str, Tensor]]
The masks applied on the compact model; if the original model has been sped up, this should be {}.
config_list : List[Dict]
The config_list used when pruning the original model.
Returns
-------
Tuple[List[Dict], List[Dict], List[Dict]]
...
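As an aside, the per-module sparsity ratio that ``compute_sparsity`` reasons about can be illustrated like this (a sketch; the real function returns the three config-list-shaped results documented above):

.. code-block:: python

    import torch

    def mask_sparsity(mask: torch.Tensor) -> float:
        # Fraction of elements that are masked out (zero) in a weight mask.
        return 1.0 - mask.count_nonzero().item() / mask.numel()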
@@ -20,27 +20,64 @@ LOGGER = logging.getLogger('batch_tuner_AutoML')
class BatchTuner(Tuner):
"""
Batch tuner is a special tuner that allows users to simply provide several hyperparameter sets,
and it will evaluate each set.
Batch tuner does **not** support standard search space.
Search space of batch tuner looks like a single ``choice`` in standard search space,
but it has different meaning.
Consider the following search space:

.. code-block::

    'combine_params': {
        '_type': 'choice',
        '_value': [
            {'x': 0, 'y': 1},
            {'x': 1, 'y': 2},
            {'x': 1, 'y': 3},
        ]
    }

Batch tuner will generate the following 3 hyperparameter sets:
1. {'x': 0, 'y': 1}
2. {'x': 1, 'y': 2}
3. {'x': 1, 'y': 3}
If this search space were used with the grid search tuner, it would instead generate:
1. {'combine_params': {'x': 0, 'y': 1 }}
2. {'combine_params': {'x': 1, 'y': 2 }}
3. {'combine_params': {'x': 1, 'y': 3 }}
Examples
--------

.. code-block::

    config.search_space = {
        'combine_params': {
            '_type': 'choice',
            '_value': [
                {'optimizer': 'Adam', 'learning_rate': 0.001},
                {'optimizer': 'Adam', 'learning_rate': 0.0001},
                {'optimizer': 'Adam', 'learning_rate': 0.00001},
                {'optimizer': 'SGD', 'learning_rate': 0.01},
                {'optimizer': 'SGD', 'learning_rate': 0.005},
            ]
        }
    }
    config.tuner.name = 'BatchTuner'
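On the trial side, each provided set then arrives as an ordinary dict; a sketch (``train_and_eval`` is a hypothetical user function):

.. code-block:: python

    import nni

    params = nni.get_next_parameter()    # e.g. {'optimizer': 'Adam', 'learning_rate': 0.001}
    accuracy = train_and_eval(**params)  # hypothetical user training function
    nni.report_final_result(accuracy)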
""" """
def __init__(self): def __init__(self):
self._count = -1 self._count = -1
self._values = [] self._values = []
def is_valid(self, search_space): def _is_valid(self, search_space):
""" """
Check the search space is valid: only contains 'choice' type Check the search space is valid: only contains 'choice' type
@@ -70,27 +107,10 @@ class BatchTuner(Tuner):
return None
def update_search_space(self, search_space):
"""Update the search space
Parameters
----------
search_space : dict
"""
validate_search_space(search_space, ['choice'])
self._values = self._is_valid(search_space)
def generate_parameters(self, parameter_id, **kwargs):
"""Returns a dict of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
dict
A candidate parameter group.
"""
self._count += 1
if self._count > len(self._values) - 1:
raise nni.NoMoreTrialError('no more parameters now.')
@@ -100,13 +120,6 @@ class BatchTuner(Tuner):
pass
def import_data(self, data):
"""Import additional data for tuning
Parameters
----------
data:
a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
"""
if not self._values:
LOGGER.info("Search space has not been initialized, skip this data import")
return
...
@@ -249,20 +249,52 @@ class BOHBClassArgsValidator(ClassArgsValidator):
class BOHB(MsgDispatcherBase):
"""
`BOHB <https://arxiv.org/abs/1807.01774>`__ is a robust and efficient hyperparameter tuning algorithm at scale.
BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".
BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget,
but it replaces the random selection of configurations at the beginning of each HB iteration
by a model-based search (Bayesian Optimization).
Once the desired number of configurations for the iteration is reached,
the standard successive halving procedure is carried out using these configurations.
It keeps track of the performance of all function evaluations g(x, b) of configurations x
on all budgets b to use as a basis for our models in later iterations.
Please refer to the paper :footcite:t:`falkner2018bohb` for detailed algorithm.
Note that BOHB needs additional installation using the following command:
.. code-block:: bash

    pip install nni[BOHB]
Examples
--------
.. code-block::

    config.advisor.name = 'BOHB'
    config.advisor.class_args = {
        'optimize_mode': 'maximize',
        'min_budget': 1,
        'max_budget': 27,
        'eta': 3,
        'min_points_in_model': 7,
        'top_n_percent': 15,
        'num_samples': 64,
        'random_fraction': 0.33,
        'bandwidth_factor': 3.0,
        'min_bandwidth': 0.001
    }
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
min_budget: float
The smallest budget to assign to a trial job (budget can be the number of mini-batches or epochs).
Needs to be positive.
max_budget: float
The largest budget to assign to a trial job. Needs to be larger than min_budget.
The budgets will be geometrically distributed
:math:`\\sim \\eta^k` for :math:`k\\in [0, 1, ... , num\\_subsets - 1]`.
eta: int
@@ -271,21 +303,102 @@ class BOHB(MsgDispatcherBase):
1/eta of them 'advances' to the next round.
Must be greater or equal to 2.
min_points_in_model: int
Number of observations to start building a KDE. Default 'None' means dim+1;
when the number of completed trials in this budget is equal to or larger than ``max{dim+1, min_points_in_model}``,
BOHB will start to build a KDE model of this budget then use said KDE model to guide configuration selection.
Needs to be positive. (dim means the number of hyperparameters in search space)
top_n_percent: int
Percentage (between 1 and 99, default 15) of the observations which are considered good.
Good points and bad points are used for building KDE models.
For example, if you have 100 observed trials and top_n_percent is 15,
then the top 15% of points will be used for building the good points models "l(x)".
The remaining 85% of points will be used for building the bad point models "g(x)".
num_samples: int
Number of samples to optimize EI (default 64).
In this case, it will sample "num_samples" points and compare the result of l(x)/g(x).
Then it will return the one with the maximum l(x)/g(x) value as the next configuration
if the optimize_mode is ``maximize``. Otherwise, it returns the smallest one.
random_fraction: float
Fraction of purely random configurations that are sampled from the prior without the model.
bandwidth_factor: float
To encourage diversity, the points proposed to optimize EI are sampled
from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3).
It is suggested to use the default value if you are not familiar with KDE.
min_bandwidth: float
To keep diversity, even when all (good) samples have the same value for one of the parameters,
a minimum bandwidth (default: 1e-3) is used instead of zero.
It is suggested to use the default value if you are not familiar with KDE.
config_space: str
Directly use a .pcs file serialized by `ConfigSpace <https://automl.github.io/ConfigSpace/>`__ in "pcs new" format.
In this case, search space file (if provided in config) will be ignored.
Note that this path needs to be an absolute path. Relative path is currently not supported.
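For intuition, the geometric budget spacing can be sketched as plain arithmetic (illustrative only, using the example values ``min_budget=1``, ``max_budget=27``, ``eta=3``):

.. code-block:: python

    import math

    min_budget, max_budget, eta = 1, 27, 3
    num_subsets = int(math.log(max_budget / min_budget, eta)) + 1   # 4
    budgets = [max_budget * eta ** -k for k in range(num_subsets)]  # [27.0, 9.0, 3.0, 1.0]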
Notes
-----
Below is an introduction to the BOHB process, separated into two parts:
**The first part HB (Hyperband).**
BOHB follows Hyperband's way of choosing the budgets and continues to use SuccessiveHalving.
For more details, you can refer to the :class:`nni.algorithms.hpo.hyperband_advisor.Hyperband`
and the `reference paper for Hyperband <https://arxiv.org/abs/1603.06560>`__.
This procedure is summarized by the pseudocode below.
.. image:: ../../img/bohb_1.png
:scale: 80 %
:align: center
**The second part BO (Bayesian Optimization)**
The BO part of BOHB closely resembles TPE with one major difference:
It opted for a single multidimensional KDE compared to the hierarchy of one-dimensional KDEs used in TPE
in order to better handle interaction effects in the input space.
Tree Parzen Estimator (TPE) uses a KDE (kernel density estimator) to model the densities.
.. image:: ../../img/bohb_2.png
:scale: 80 %
:align: center
To fit useful KDEs, we require a minimum number of data points Nmin;
this is set to d + 1 for our experiments, where d is the number of hyperparameters.
To build a model as early as possible, we do not wait until Nb = \|Db\|,
the number of observations for budget b, is large enough to satisfy q · Nb ≥ Nmin.
Instead, after initializing with Nmin + 2 random configurations, we choose the
best and worst configurations, respectively, to model the two densities.
Note that it also samples a constant fraction named **random fraction** of the configurations uniformly at random.
.. image:: ../../img/bohb_3.png
:scale: 80 %
:align: center
.. image:: ../../img/bohb_6.jpg
:scale: 65 %
:align: center
**The above image shows the workflow of BOHB.**
Here we set max_budget = 9, min_budget = 1, eta = 3, and others as default.
In this case, s_max = 2, so we will continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle.
In each stage of SuccessiveHalving (the orange box), it will pick the top 1/eta configurations and run them again with more budget,
repeating the SuccessiveHalving stage until the end of this iteration.
At the same time, it collects the configurations, budgets and final metrics of each trial
and uses these to build a multidimensional KDE model with the key "budget".
Multidimensional KDE is used to guide the selection of configurations for the next iteration.
The sampling procedure (using Multidimensional KDE to guide selection) is summarized by the pseudocode below.
.. image:: ../../img/bohb_4.png
:scale: 80 %
:align: center
**Here is a simple experiment which tunes MNIST with BOHB.**
Code implementation: :githublink:`examples/trials/mnist-advisor <examples/trials/mnist-advisor>`
The following is the experimental final results:
.. image:: ../../img/bohb_5.png
:scale: 80 %
:align: center
More experimental results can be found in the `reference paper <https://arxiv.org/abs/1807.01774>`__.
It shows that BOHB makes good use of previous results and has a balanced trade-off in exploration and exploitation.
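A toy sketch of the good/bad split driven by ``top_n_percent`` (synthetic data; the real implementation fits multidimensional KDEs as described above):

.. code-block:: python

    import numpy as np

    observations = np.random.rand(100)            # stand-in for observed metrics
    order = np.argsort(observations)[::-1]        # best first, for 'maximize' mode
    n_good = max(1, int(len(observations) * 15 / 100))
    good, bad = order[:n_good], order[n_good:]    # indices used to fit l(x) and g(x)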
""" """
def __init__(self, def __init__(self,
......
...@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator): ...@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator):
}).validate(kwargs) }).validate(kwargs)
class CurvefittingAssessor(Assessor): class CurvefittingAssessor(Assessor):
"""CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future. """
CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future.
The intermediate result **must** be accuracy. Curve fitting does not support minimizing loss.
Curve fitting assessor is an LPA (learning, predicting, assessing) algorithm.
It stops a pending trial X at step S if the trial's forecast result at the target step converges and is lower than the
best performance in the history.
Paper: `Speeding up Automatic Hyperparameter Optimization of Deep Neural Networks by Extrapolation of Learning Curves
<https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf>`__
Examples
--------
.. code-block::

    config.assessor.name = 'Curvefitting'
    config.assessor.class_args = {
        'epoch_num': 20,
        'start_step': 6,
        'threshold': 0.9,
        'gap': 1,
    }
Parameters
----------
epoch_num : int
The total number of epochs.
We need to know the number of epochs to determine which points we need to predict.
start_step : int
A trial is determined to be stopped or not only after receiving start_step number of intermediate results.
threshold : float
The threshold that we use to decide to early stop the worst performance curve.
For example: if threshold = 0.95, and the best performance in the history is 0.9,
then we will stop the trial whose predicted value is lower than 0.95 * 0.9 = 0.855.
gap : int
The gap interval between assessor judgements.
For example: if gap = 2, start_step = 6,
then we will assess the result when we get 6, 8, 10, 12, ... intermediate results.
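Putting ``start_step``, ``gap`` and ``threshold`` together, a sketch of when and how the assessor judges (plain arithmetic, not the actual implementation):

.. code-block:: python

    start_step, gap, epoch_num = 6, 2, 20
    assess_points = list(range(start_step, epoch_num + 1, gap))  # [6, 8, 10, ..., 20]

    threshold, best_history = 0.95, 0.9
    stop_if_prediction_below = threshold * best_history          # 0.855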
""" """
def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1): def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1):
...@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor): ...@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor):
logger.info('Successfully initials the curvefitting assessor') logger.info('Successfully initials the curvefitting assessor')
def trial_end(self, trial_job_id, success): def trial_end(self, trial_job_id, success):
"""update the best performance of completed trial job
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if succssfully finish the experiment, False otherwise
"""
if success:
if self.set_best_performance:
self.completed_best_performance = max(self.completed_best_performance, self.trial_history[-1])
@@ -76,25 +101,6 @@ class CurvefittingAssessor(Assessor):
logger.info('No need to update, trial job id: %s', trial_job_id)
def assess_trial(self, trial_job_id, trial_history):
"""assess whether a trial should be early stop by curve fitting algorithm
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognize exception in curvefitting_assessor
"""
scalar_trial_history = extract_scalar_history(trial_history)
self.trial_history = scalar_trial_history
if not self.set_best_performance:
...
@@ -44,7 +44,20 @@ def _random_config(search_space, random_state):
class DNGOTuner(Tuner):
"""
Use neural networks as an alternative to GPs to model distributions over functions in Bayesian optimization.
Parameters
----------
optimize_mode : maximize | minimize, default = maximize
If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
sample_size : int, default = 1000
Number of samples to select in each iteration. The best one will be picked from the samples as the next trial.
trials_per_update : int, default = 20
Number of trials to collect before updating the model.
num_epochs_per_training : int, default = 500
Number of epochs to train DNGO model.
"""
def __init__(self, optimize_mode='maximize', sample_size=1000, trials_per_update=20, num_epochs_per_training=500):
self.searchspace_json = None
self.random_state = None
...
@@ -4,6 +4,7 @@
"""
evolution_tuner.py
"""
from __future__ import annotations
import copy
import random
@@ -22,28 +23,19 @@ logger = logging.getLogger(__name__)
class Individual:
"""
Individual class to store the individual's info.
Parameters
----------
config : str, default = None
Search space.
info : str, default = None
The str to save information of the individual.
result : float, default = None
The final metric of an individual.
"""
def __init__(self, config=None, info=None, result=None):
"""
Parameters
----------
config : str
A config to represent a group of parameters.
info : str
result : float
save_dir : str
"""
self.config = config
self.result = result
self.info = info
@@ -61,18 +53,36 @@ class EvolutionClassArgsValidator(ClassArgsValidator):
class EvolutionTuner(Tuner):
"""
Naive Evolution comes from `Large-Scale Evolution of Image Classifiers <https://arxiv.org/pdf/1703.01041.pdf>`__.
It randomly initializes a population based on the search space.
For each generation, it chooses better ones and does some mutation.
(e.g., changes a hyperparameter, adds/removes one layer, etc.) on them to get the next generation.
Naive Evolution requires many trials to work, but it is very simple and easy to expand with new features.
Examples
--------
.. code-block::

    config.tuner.name = 'Evolution'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'population_size': 100
    }
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
population_size: int
The initial size of the population (trial num) in the evolution tuner (default=32).
The larger the population size, the better the evolution performance.
It's suggested that ``population_size`` be much larger than ``concurrency`` so users can get the most out of the algorithm.
It should be at least ``concurrency``, or the tuner will fail on its first generation of parameters.
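A toy sketch of one naive-evolution step (deliberately simplified; NNI mutates within the declared search space rather than a hard-coded list):

.. code-block:: python

    import copy
    import random

    population = [{'lr': 0.01, 'score': 0.7}, {'lr': 0.1, 'score': 0.5}]
    parent = max(population, key=lambda ind: ind['score'])  # keep a better individual
    child = copy.deepcopy(parent)
    child['lr'] = random.choice([0.001, 0.01, 0.1])         # mutate one hyperparameter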
""" """
def __init__(self, optimize_mode="maximize", population_size=32): def __init__(self, optimize_mode='maximize', population_size=32):
"""
Parameters
----------
optimize_mode : str, default 'maximize'
population_size : int
initial population size. The larger population size,
the better evolution performance.
"""
self.optimize_mode = OptimizeMode(optimize_mode)
self.population_size = population_size
@@ -89,11 +99,11 @@ class EvolutionTuner(Tuner):
def update_search_space(self, search_space):
"""
Update search space.
Search_space contains the information that the user pre-defined.
Parameters
----------
search_space : dict
"""
self.searchspace_json = search_space
@@ -109,8 +119,10 @@ class EvolutionTuner(Tuner):
"""
To deal with trial failure. If a trial fails,
randomly generate new parameters and add them into the population.
Parameters
----------
parameter_id : int
Unique identifier for hyper-parameters used by this trial.
success : bool
@@ -136,12 +148,15 @@ class EvolutionTuner(Tuner):
def generate_multiple_parameters(self, parameter_id_list, **kwargs):
"""
Returns multiple sets of trial (hyper-)parameters, as iterable of serializable objects.
Parameters
----------
parameter_id_list : list of int
Unique identifiers for each set of requested hyper-parameters.
**kwargs
Not used
Returns
-------
list
@@ -182,12 +197,13 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
A group of candidate parameters that evolution tuner generated.
"""
pos = -1
@@ -234,10 +250,12 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
One newly generated configuration.
"""
@@ -258,6 +276,7 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
...
@@ -41,29 +41,77 @@ class GPClassArgsValidator(ClassArgsValidator):
class GPTuner(Tuner):
"""
GPTuner is a Bayesian Optimization method where Gaussian Process
is used for modeling loss functions.
Bayesian optimization works by constructing a posterior distribution of functions
(a Gaussian Process) that best describes the function you want to optimize.
As the number of observations grows, the posterior distribution improves,
and the algorithm becomes more certain of which regions in parameter space
are worth exploring and which are not.
GPTuner is designed to minimize/maximize the number of steps required to find
a combination of parameters that are close to the optimal combination.
To do so, this method uses a proxy optimization problem (finding the maximum of
the acquisition function) that, albeit still a hard problem, is cheaper
(in the computational sense) to solve, and it's amenable to common tools.
Therefore, Bayesian Optimization is suggested for situations where sampling the function
to be optimized is very expensive.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``randint``, ``uniform``, ``quniform``, ``loguniform``, ``qloguniform``, and numerical ``choice``.
This optimization approach is described in Section 3 of the paper
`Algorithms for Hyper-Parameter Optimization <https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf>`__
( :footcite:t:`bergstra2011algorithms` ).
Examples
--------
.. code-block::

    config.tuner.name = 'GPTuner'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'utility': 'ei',
        'kappa': 5.0,
        'xi': 0.0,
        'nu': 2.5,
        'alpha': 1e-6,
        'cold_start_num': 10,
        'selection_num_warm_up': 100000,
        'selection_num_starting_points': 250
    }
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'.
utility : str
Utility function (also called 'acquisition function') to use,
which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
kappa : float
Value used by utility function 'ucb'. The bigger kappa is,
the more the tuner will be exploratory. By default 5.
xi : float
Used by utility function 'ei' and 'poi'. The bigger xi is,
the more the tuner will be exploratory. By default 0.
nu : float
Used to specify the Matern kernel. The smaller nu,
the less smooth the approximated function is. By default 2.5.
alpha : float
Used to specify the Gaussian Process Regressor.
Larger values correspond to an increased noise level in the observations.
By default 1e-6.
cold_start_num : int
Number of random explorations to perform before Gaussian Process.
By default 10.
selection_num_warm_up : int
Number of random points to evaluate for getting the point which
maximizes the acquisition function. By default 100000.
selection_num_starting_points : int
Number of times to run L-BFGS-B from a random starting point after the warmup.
By default 250.
""" """
def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10, def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10,
......
@@ -2,14 +2,10 @@
# Licensed under the MIT license.
"""
Grid search tuner.
For categorical parameters this tuner fully explores all combinations.
For numerical parameters it samples them at progressively decreased intervals.
"""
__all__ = ['GridSearchTuner']
@@ -63,6 +59,35 @@ _logger = logging.getLogger('nni.tuner.gridsearch')
##
class GridSearchTuner(Tuner):
"""
Grid search tuner divides search space into evenly spaced grid, and performs brute-force traverse.
Recommended when the search space is small, or if you want to find strictly optimal hyperparameters.
**Implementation**
The original grid search approach performs an exhaustive search through a space consists of ``choice`` and ``randint``.
NNI's implementation extends grid search to support all search spaces types.
When the search space contains continuous parameters like ``normal`` and ``loguniform``,
grid search tuner works in following steps:
1. Divide the search space into a grid.
2. Perform an exhaustive searth through the grid.
3. Subdivide the grid into a finer-grained new grid.
4. Goto step 2, until experiment end.
As a deterministic algorithm, grid search has no argument.
Examples
--------

.. code-block::

    config.tuner.name = 'GridSearch'
"""
def __init__(self):
self.space = None
...
@@ -105,7 +105,8 @@ def json2parameter(ss_spec, random_state):
class Bracket():
"""
A bracket in Hyperband; all the information of a bracket is managed by an instance of this class.
Parameters
----------
@@ -267,24 +268,136 @@ class HyperbandClassArgsValidator(ClassArgsValidator):
class Hyperband(MsgDispatcherBase):
"""
`Hyperband <https://arxiv.org/pdf/1603.06560.pdf>`__ is a multi-fidelity hyperparameter tuning algorithm
based on successive halving.
The basic idea of Hyperband is to create several buckets,
each having ``n`` randomly generated hyperparameter configurations,
each configuration using ``r`` resources (e.g., epoch number, batch number).
After the ``n`` configurations are finished, it chooses the top ``n/eta`` configurations
and runs them using increased ``r*eta`` resources.
At last, it chooses the best configuration it has found so far.
Please refer to the paper :footcite:t:`li2017hyperband` for detailed algorithm.
Examples
--------
.. code-block::

    config.advisor.name = 'Hyperband'
    config.advisor.class_args = {
        'optimize_mode': 'maximize',
        'R': 60,
        'eta': 3
    }
Note that once you use Advisor, you are not allowed to add a Tuner and Assessor spec in the config file.
When Hyperband is used, the dict returned by :func:`nni.get_next_parameter` has one more key
called ``TRIAL_BUDGET`` besides the hyperparameters and their values.
**With this TRIAL_BUDGET, users can control in trial code how long a trial runs by following
the suggested trial budget from Hyperband.** ``TRIAL_BUDGET`` is a relative number,
users can interpret them as number of epochs, number of mini-batches, running time, etc.
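A sketch of trial code honoring the budget (interpreting ``TRIAL_BUDGET`` as epochs is the user's choice; ``train_one_epoch`` and ``evaluate`` are hypothetical user functions):

.. code-block:: python

    import nni

    params = nni.get_next_parameter()
    budget = int(params['TRIAL_BUDGET'])  # relative budget suggested by Hyperband
    for epoch in range(budget):
        train_one_epoch(params)           # hypothetical user training step
    nni.report_final_result(evaluate())   # hypothetical user evaluation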
Here is a concrete example of ``R=81`` and ``eta=3``:
.. list-table::
:header-rows: 1
:widths: auto
* -
- s=4
- s=3
- s=2
- s=1
- s=0
* - i
- n r
- n r
- n r
- n r
- n r
* - 0
- 81 1
- 27 3
- 9 9
- 6 27
- 5 81
* - 1
- 27 3
- 9 9
- 3 27
- 2 81
-
* - 2
- 9 9
- 3 27
- 1 81
-
-
* - 3
- 3 27
- 1 81
-
-
-
* - 4
- 1 81
-
-
-
-
``s`` means bucket, ``n`` means the number of configurations that are generated,
the corresponding ``r`` means how many budgets these configurations run.
``i`` means round, for example, bucket 4 has 5 rounds, bucket 3 has 4 rounds.
A complete example can be found at :githublink:`examples/trials/mnist-advisor`.
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
R: int
The maximum amount of budget that can be allocated to a single configuration.
Here, trial budget could mean the number of epochs, number of mini-batches, etc.,
depending on how users interpret it.
Each trial should use ``TRIAL_BUDGET`` to control how long it runs.
eta: int
The variable that controls the proportion of configurations discarded in each round of SuccessiveHalving.
``1/eta`` configurations will survive and rerun using more budgets in each round.
exec_mode: str
Execution mode, 'serial' or 'parallelism'.
If 'parallelism', the tuner will try to use available resources to start new buckets immediately.
If 'serial', the tuner will only start a new bucket after the current bucket is done.
Notes
-----
First, Hyperband is an example of how to write an autoML algorithm based on MsgDispatcherBase,
rather than based on Tuner and Assessor. Hyperband is implemented in this way
because it integrates the functions of both Tuner and Assessor; thus, we call it Advisor.
Second, this implementation fully leverages Hyperband's internal parallelism.
Specifically, the next bucket is not started strictly after the current bucket.
Instead, it starts when there are available resources. If you want to use full parallelism mode,
set ``exec_mode`` to ``parallelism``.
Alternatively, you can set ``exec_mode`` to ``serial`` to follow the original algorithm.
In this mode, the next bucket will start strictly after the current bucket.
``parallelism`` mode may lead to multiple unfinished buckets,
in contrast, there is at most one unfinished bucket under ``serial`` mode.
The advantage of ``parallelism`` mode is to make full use of resources,
which may reduce the experiment duration multiple times.
""" """
def __init__(self, R=60, eta=3, optimize_mode='maximize', exec_mode='parallelism'): def __init__(self, optimize_mode='maximize', R=60, eta=3, exec_mode='parallelism'):
"""B = (s_max + 1)R""" """B = (s_max + 1)R"""
super(Hyperband, self).__init__() super(Hyperband, self).__init__()
self.R = R self.R = R
......
...@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator): ...@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator):
class HyperoptTuner(Tuner): class HyperoptTuner(Tuner):
""" """
HyperoptTuner is a tuner which using hyperopt algorithm. NNI wraps `hyperopt <https://github.com/hyperopt/hyperopt>`__ to provide anneal tuner.
This simple annealing algorithm begins by sampling from the prior
but tends over time to sample from points closer and closer to the best ones observed.
This algorithm is a simple variation of random search that leverages smoothness in the response surface.
The annealing rate is not adaptive.
Examples
--------
.. code-block::

    config.tuner.name = 'Anneal'
    config.tuner.class_args = {
        'optimize_mode': 'minimize'
    }
Parameters
----------
optimize_mode: 'minimize' or 'maximize'
Whether to minimize or maximize the trial result.
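A toy sketch of the annealing idea on a single uniform parameter (illustrative only; hyperopt's actual anneal sampler is more involved):

.. code-block:: python

    import random

    def anneal_sample(best: float, t: int, low: float = 0.0, high: float = 1.0) -> float:
        # Over time (growing t), sample in a shrinking neighborhood of the best point.
        width = (high - low) / (1 + t)
        return min(high, max(low, random.uniform(best - width, best + width)))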
""" """
def __init__(self, algorithm_name, optimize_mode='minimize', def __init__(self, algorithm_name, optimize_mode='minimize',
parallel_optimize=False, constant_liar_type='min'): parallel_optimize=False, constant_liar_type='min'):
"""
Parameters
----------
algorithm_name : str
algorithm_name includes "tpe", "random_search" and anneal".
optimize_mode : str
parallel_optimize : bool
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
constant_liar_type : str
constant_liar_type including "min", "max" and "mean"
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
"""
self.algorithm_name = algorithm_name
self.optimize_mode = OptimizeMode(optimize_mode)
self.json = None
@@ -238,15 +246,6 @@ class HyperoptTuner(Tuner):
raise RuntimeError('Not support tuner algorithm in hyperopt.')
def update_search_space(self, search_space):
"""
Update search space definition in tuner by search_space in parameters.
Will called when first setup experiemnt or update search space in WebUI.
Parameters
----------
search_space : dict
"""
validate_search_space(search_space)
self.json = search_space
@@ -266,22 +265,11 @@ class HyperoptTuner(Tuner):
self.rval.catch_eval_exceptions = False
def generate_parameters(self, parameter_id, **kwargs):
""" total_params = self._get_suggestion(random_search=False)
Returns a set of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
params : dict
"""
total_params = self.get_suggestion(random_search=False)
# avoid generating same parameter with concurrent trials because hyperopt doesn't support parallel mode
if total_params in self.total_data.values():
# but it can cause duplicate parameter rarely
total_params = self._get_suggestion(random_search=True)
self.total_data[parameter_id] = total_params
if self.parallel:
@@ -291,17 +279,6 @@ class HyperoptTuner(Tuner):
return params
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
"""
Record an observation of the objective function
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
if value is dict, it should have "default" key.
value is final metrics of the trial.
"""
reward = extract_scalar_reward(value)
# restore the parameters containing '_index'
if parameter_id not in self.total_data:
@@ -369,7 +346,7 @@ class HyperoptTuner(Tuner):
idxs[key] = [new_id]
vals[key] = [vals[key]]
self._miscs_update_idxs_vals(rval_miscs,
idxs,
vals,
idxs_map={new_id: new_id},
@@ -382,7 +359,7 @@ class HyperoptTuner(Tuner):
trials.insert_trial_docs([trial])
trials.refresh()
def _miscs_update_idxs_vals(self,
miscs,
idxs,
vals,
@@ -416,7 +393,7 @@ class HyperoptTuner(Tuner):
misc_by_id[tid]['idxs'][key] = [tid]
misc_by_id[tid]['vals'][key] = [val]
def _get_suggestion(self, random_search=False):
"""
Get a suggestion from hyperopt.
@@ -469,14 +446,6 @@ class HyperoptTuner(Tuner):
return total_params
def import_data(self, data):
"""
Import additional data for tuning
Parameters
----------
data:
a list of dictionarys, each of which has at least two keys, 'parameter' and 'value'
"""
_completed_num = 0
for trial_info in data:
logger.info("Importing data, current processing progress %s / %s", _completed_num, len(data))
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations
import logging
from schema import Schema, Optional
from nni import ClassArgsValidator
from nni.assessor import Assessor, AssessResult
from nni.typehint import Literal
from nni.utils import extract_scalar_history
logger = logging.getLogger('medianstop_Assessor')
@@ -18,18 +21,35 @@ class MedianstopClassArgsValidator(ClassArgsValidator):
}).validate(kwargs)
class MedianstopAssessor(Assessor):
"""
The median stopping rule stops a pending trial X at step S
if the trial’s best objective value by step S is strictly worse than the median value
of the running averages of all completed trials’ objectives reported up to step S.
Paper: `Google Vizier: A Service for Black-Box Optimization
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf>`__
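The rule itself is compact enough to sketch directly (a toy version; the real assessor tracks running averages per intermediate step):

.. code-block:: python

    import statistics

    def should_stop(best_by_step_s: float, completed_running_avgs: list) -> bool:
        # Stop if the trial's best value so far is strictly worse than the median
        # of completed trials' running averages at the same step (maximize mode).
        return best_by_step_s < statistics.median(completed_running_avgs)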
Examples
--------
.. code-block::

    config.assessor.name = 'Medianstop'
    config.assessor.class_args = {
        'optimize_mode': 'maximize',
        'start_step': 5
    }
Parameters
----------
optimize_mode
Whether to minimize or maximize the trial result.
start_step
A trial is determined to be stopped or not
only after receiving start_step number of reported intermediate results.
"""
def __init__(self, optimize_mode='maximize', start_step=0):
def __init__(self, optimize_mode: Literal['minimize', 'maximize'] = 'maximize', start_step: int = 0):
self._start_step = start_step self._start_step = start_step
self._running_history = dict() self._running_history = dict()
self._completed_avg_history = dict() self._completed_avg_history = dict()
...@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor): ...@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor):
self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):]) self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):])
def trial_end(self, trial_job_id, success): def trial_end(self, trial_job_id, success):
"""trial_end
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if successfully finished the experiment, False otherwise
"""
if trial_job_id in self._running_history: if trial_job_id in self._running_history:
if success: if success:
cnt = 0 cnt = 0
...@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor): ...@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor):
logger.warning('trial_end: trial_job_id does not exist in running_history') logger.warning('trial_end: trial_job_id does not exist in running_history')
def assess_trial(self, trial_job_id, trial_history): def assess_trial(self, trial_job_id, trial_history):
"""assess_trial
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognized exception in medianstop_assessor
"""
curr_step = len(trial_history) curr_step = len(trial_history)
if curr_step < self._start_step: if curr_step < self._start_step:
return AssessResult.Good return AssessResult.Good
......
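For intuition, the median stopping rule described in the MedianstopAssessor docstring can be sketched standalone. This is illustrative only, not the assessor's actual code; ``completed_running_avgs`` stands for the running averages of all completed trials' objectives reported up to the current step:

.. code-block:: python

    import statistics

    from nni.assessor import AssessResult

    def median_stop(trial_best, completed_running_avgs, optimize_mode='maximize'):
        # Stop the pending trial if its best objective so far is strictly
        # worse than the median of completed trials' running averages.
        if not completed_running_avgs:
            return AssessResult.Good
        median = statistics.median(completed_running_avgs)
        if optimize_mode == 'maximize':
            worse = trial_best < median
        else:
            worse = trial_best > median
        return AssessResult.Bad if worse else AssessResult.Good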
...@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator): ...@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator):
class MetisTuner(Tuner): class MetisTuner(Tuner):
""" """
Metis Tuner `Metis tuner <https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/>`__ offers
several benefits over other tuning algorithms.
While most tools only predict the optimal configuration, Metis gives you two outputs,
a prediction for the optimal configuration and a suggestion for the next trial.
No more guesswork!
More algorithm information you could reference here: While most tools assume training datasets do not have noisy data,
https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/ Metis actually tells you if you need to resample a particular hyper-parameter.
Attributes While most tools have problems of being exploitation-heavy,
Metis' search strategy balances exploration, exploitation, and (optional) resampling.
Metis belongs to the class of sequential model-based optimization (SMBO) algorithms
and it is based on the Bayesian Optimization framework. To model the parameter-vs-performance space,
Metis uses both a Gaussian Process and GMM. Since each trial can impose a high time cost,
Metis heavily trades inference computations with naive trials.
At each iteration, Metis does two tasks (refer to :footcite:t:`li2018metis` for details):
1. It finds the global optimal point in the Gaussian Process space.
This point represents the optimal configuration.
2. It identifies the next hyper-parameter candidate.
This is achieved by inferring the potential information gain of
exploration, exploitation, and resampling.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``quniform``, ``uniform``, ``randint``, and numerical ``choice``.
Examples
--------
.. code-block::
config.tuner.name = 'MetisTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
---------- ----------
optimize_mode : str optimize_mode : str
optimize_mode is a string that includes two modes, "maximize" and "minimize" optimize_mode is a string that includes two modes, "maximize" and "minimize"
no_resampling : bool no_resampling : bool
True or False. True or False.
Should Metis consider re-sampling as part of the search strategy? Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free, If you are confident that the training dataset is noise-free,
then you do not need re-sampling. then you do not need re-sampling.
no_candidates : bool no_candidates : bool
True or False. True or False.
Should Metis suggest parameters for the next benchmark? Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks, If you do not plan to do more benchmarks,
Metis can skip this step. Metis can skip this step.
selection_num_starting_points : int selection_num_starting_points : int
How many times Metis should try to find the global optimum in the search space? How many times Metis should try to find the global optimum in the search space?
The higher the number, the longer it takes to output the solution. The higher the number, the longer it takes to output the solution.
cold_start_num : int cold_start_num : int
Metis needs some trial results for cold start. Metis needs some trial results for cold start.
When the number of trial results is less than When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for trials. cold_start_num, Metis will randomly sample hyper-parameters for trials.
exploration_probability: float exploration_probability: float
The probability of Metis to select parameter from exploration instead of exploitation. The probability of Metis to select parameter from exploration instead of exploitation.
""" """
def __init__( def __init__(
...@@ -89,43 +124,6 @@ class MetisTuner(Tuner): ...@@ -89,43 +124,6 @@ class MetisTuner(Tuner):
selection_num_starting_points=600, selection_num_starting_points=600,
cold_start_num=10, cold_start_num=10,
exploration_probability=0.9): exploration_probability=0.9):
"""
Parameters
----------
optimize_mode : str
optimize_mode is a string that including two mode "maximize" and "minimize"
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
How many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis need some trial result to get cold start.
when the number of trial result is less than
cold_start_num, Metis will randomly sample hyper-parameter for trial.
exploration_probability : float
The probability of Metis to select parameter from exploration instead of exploitation.
x_bounds : list
The constraints of parameters.
x_types : list
The type of parameters.
"""
self.samples_x = [] self.samples_x = []
self.samples_y = [] self.samples_y = []
self.samples_y_aggregation = [] self.samples_y_aggregation = []
...@@ -141,7 +139,9 @@ class MetisTuner(Tuner): ...@@ -141,7 +139,9 @@ class MetisTuner(Tuner):
self.minimize_constraints_fun = None self.minimize_constraints_fun = None
self.minimize_starting_points = None self.minimize_starting_points = None
self.supplement_data_num = 0 self.supplement_data_num = 0
# The constraints of parameters
self.x_bounds = [] self.x_bounds = []
# The type of parameters
self.x_types = [] self.x_types = []
......
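The cold-start and exploration behavior documented above can be condensed into a small sketch (illustrative pseudologic under the docstring's defaults, not the tuner's actual implementation):

.. code-block:: python

    import random

    def choose_strategy(num_trial_results, cold_start_num=10,
                        exploration_probability=0.9):
        # Too few results: fall back to random sampling (cold start).
        if num_trial_results < cold_start_num:
            return 'random'
        # Otherwise prefer exploration with the configured probability.
        if random.random() < exploration_probability:
            return 'exploration'
        return 'exploitation'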
...@@ -170,26 +170,91 @@ class PBTClassArgsValidator(ClassArgsValidator): ...@@ -170,26 +170,91 @@ class PBTClassArgsValidator(ClassArgsValidator):
}).validate(kwargs) }).validate(kwargs)
class PBTTuner(Tuner): class PBTTuner(Tuner):
"""
Population Based Training (PBT) comes from `Population Based Training of Neural Networks <https://arxiv.org/abs/1711.09846v1>`__.
It's a simple asynchronous optimization algorithm which effectively utilizes a fixed computational budget to jointly optimize
a population of models and their hyperparameters to maximize performance.
Importantly, PBT discovers a schedule of hyperparameter settings rather than following the generally sub-optimal strategy of
trying to find a single fixed set to use for the whole course of training.
.. image:: ../../img/pbt.jpg
PBTTuner initializes a population with several trials (i.e., ``population_size``).
There are four steps in the above figure; each trial runs only one step at a time. The length of one step is controlled by the trial code,
e.g., one epoch. When a trial starts, it loads a checkpoint specified by PBTTuner and continues to run one step,
then saves checkpoint to a directory specified by PBTTuner and exits.
The trials in a population run steps synchronously, that is, after all the trials finish the ``i``-th step,
the ``(i+1)``-th step can be started. Exploitation and exploration of PBT are executed between two consecutive steps.
Two important steps to follow if you are trying to use PBTTuner:
1. **Provide checkpoint directory**. Since some trials need to load other trial's checkpoint,
users should provide a directory (i.e., ``all_checkpoint_dir``) which is accessible by every trial.
It is easy for local mode, users could directly use the default directory or specify any directory on the local machine.
For other training services, users should follow :doc:`the document of those training services <../experiment/training_service>`
to provide a directory in a shared storage, such as NFS, Azure storage.
2. **Modify your trial code**. Before running a step, a trial needs to load a checkpoint,
the checkpoint directory is specified in hyper-parameter configuration generated by PBTTuner,
i.e., ``params['load_checkpoint_dir']``. Similarly, the directory for saving checkpoint is also included in the configuration,
i.e., ``params['save_checkpoint_dir']``. Here, ``all_checkpoint_dir`` is base folder of ``load_checkpoint_dir``
and ``save_checkpoint_dir`` whose format is ``all_checkpoint_dir/<population-id>/<step>``.
.. code-block:: python
params = nni.get_next_parameter()
# the path of the checkpoint to load
load_path = os.path.join(params['load_checkpoint_dir'], 'model.pth')
# load checkpoint from `load_path`
...
# run one step
...
# the path for saving a checkpoint
save_path = os.path.join(params['save_checkpoint_dir'], 'model.pth')
# save checkpoint to `save_path`
...
The complete example code can be found :githublink:`here <examples/trials/mnist-pbt-tuner-pytorch>`.
Parameters
----------
optimize_mode : ``maximize`` or ``minimize``, default: ``maximize``
If ``maximize``, the tuner will target to maximize metrics. If ``minimize``, the tuner will target to minimize metrics.
all_checkpoint_dir : str
Directory for trials to load and save checkpoint.
If not specified, the directory would be ``~/nni/checkpoint/``.
Note that if the experiment is not local mode,
users should provide a path in a shared storage which can be accessed by all the trials.
population_size : int, default = 10
Number of trials in a population. Each step has this number of trials.
In our implementation, one step means running each trial for a specific number of training epochs set by users.
factor : float, default = 0.2
Factor for perturbation of hyperparameters.
resample_probability : float, default = 0.25
Probability for resampling.
fraction : float, default = 0.2
Fraction for selecting bottom and top trials.
Examples
--------
Below is an example of PBTTuner configuration in experiment config file.
.. code-block:: yaml
tuner:
name: PBTTuner
classArgs:
optimize_mode: maximize
all_checkpoint_dir: /the/path/to/store/checkpoints
population_size: 10
Notes
-----
Assessor is not allowed if PBTTuner is used.
"""
def __init__(self, optimize_mode="maximize", all_checkpoint_dir=None, population_size=10, factor=0.2, def __init__(self, optimize_mode="maximize", all_checkpoint_dir=None, population_size=10, factor=0.2,
resample_probability=0.25, fraction=0.2): resample_probability=0.25, fraction=0.2):
"""
Initialization
Parameters
----------
optimize_mode : str
maximize or minimize
all_checkpoint_dir : str
directory to store training model checkpoint
population_size : int
number of trials for each epoch
factor : float
factor for perturbation
resample_probability : float
probability for resampling
fraction : float
fraction for selecting bottom and top trials
"""
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
if all_checkpoint_dir is None: if all_checkpoint_dir is None:
all_checkpoint_dir = os.getenv('NNI_CHECKPOINT_DIRECTORY') all_checkpoint_dir = os.getenv('NNI_CHECKPOINT_DIRECTORY')
......
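Fleshing out the skeleton from the PBTTuner docstring, one step of a trial might look like the sketch below; the ``nn.Linear`` module and the hard-coded metric are stand-ins for a real model, training loop, and validation result:

.. code-block:: python

    import os

    import nni
    import torch
    import torch.nn as nn

    params = nni.get_next_parameter()
    model = nn.Linear(10, 2)  # stand-in for the real model

    # Resume from the checkpoint chosen by PBTTuner; the first step of a
    # population member has nothing to load yet.
    load_path = os.path.join(params['load_checkpoint_dir'], 'model.pth')
    if os.path.isfile(load_path):
        model.load_state_dict(torch.load(load_path))

    # ... run one step here (e.g. one epoch of training) ...
    nni.report_final_result(0.9)  # replace with the real validation metric

    # Save the checkpoint where PBTTuner expects it for the next step.
    os.makedirs(params['save_checkpoint_dir'], exist_ok=True)
    save_path = os.path.join(params['save_checkpoint_dir'], 'model.pth')
    torch.save(model.state_dict(), save_path)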
...@@ -306,40 +306,37 @@ class PPOClassArgsValidator(ClassArgsValidator): ...@@ -306,40 +306,37 @@ class PPOClassArgsValidator(ClassArgsValidator):
class PPOTuner(Tuner): class PPOTuner(Tuner):
""" """
PPOTuner, the implementation inherits the main logic of the implementation PPOTuner, the implementation inherits the main logic of the implementation
[ppo2 from openai](https://github.com/openai/baselines/tree/master/baselines/ppo2), and is adapted for NAS scenario. `ppo2 from openai <https://github.com/openai/baselines/tree/master/baselines/ppo2>`__ and is adapted for NAS scenario.
It uses ``lstm`` for its policy network and value network, policy and value share the same network. It uses ``lstm`` for its policy network and value network, policy and value share the same network.
Parameters
----------
optimize_mode : str
maximize or minimize
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
""" """
def __init__(self, optimize_mode, trials_per_update=20, epochs_per_update=4, minibatch_size=4, def __init__(self, optimize_mode, trials_per_update=20, epochs_per_update=4, minibatch_size=4,
ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, cliprange=0.2): ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, cliprange=0.2):
"""
Initialization, PPO model is not initialized here as search space is not received yet.
Parameters
----------
optimize_mode : str
maximize or minimize
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
"""
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.model_config = ModelConfig() self.model_config = ModelConfig()
self.model = None self.model = None
......
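Unlike the other tuners touched by this change, PPOTuner's docstring carries no Examples section. A minimal configuration sketch, assuming the builtin registration name ``PPOTuner`` (worth verifying against the installed NNI version; ``optimize_mode`` is the only required argument):

.. code-block::

    config.tuner.name = 'PPOTuner'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'trials_per_update': 20,
        'minibatch_size': 4,
    }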
...@@ -2,12 +2,14 @@ ...@@ -2,12 +2,14 @@
# Licensed under the MIT license. # Licensed under the MIT license.
""" """
Naive random tuner for hyper-parameter optimization. Naive random tuner.
You can specify an integer seed to make the result reproducible. You can specify an integer seed to make the result reproducible.
""" """
__all__ = ['RandomTuner', 'suggest', 'suggest_parameter'] from __future__ import annotations
__all__ = ['RandomTuner']
import logging import logging
...@@ -21,7 +23,26 @@ from nni.tuner import Tuner ...@@ -21,7 +23,26 @@ from nni.tuner import Tuner
_logger = logging.getLogger('nni.tuner.random') _logger = logging.getLogger('nni.tuner.random')
class RandomTuner(Tuner): class RandomTuner(Tuner):
def __init__(self, seed=None): """
A naive tuner that generates fully random hyperparameters.
Examples
--------
.. code-block::
config.tuner.name = 'Random'
config.tuner.class_args = {
'seed': 100
}
Parameters
----------
seed
The random seed.
"""
def __init__(self, seed: int | None = None):
self.space = None self.space = None
if seed is None: # explicitly generate a seed to make the experiment reproducible if seed is None: # explicitly generate a seed to make the experiment reproducible
seed = np.random.default_rng().integers(2 ** 31) seed = np.random.default_rng().integers(2 ** 31)
......
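The explicit seeding above is what makes a run reproducible: recording the generated seed lets an experiment be replayed exactly. The underlying pattern can be checked in isolation:

.. code-block:: python

    import numpy as np

    # Two generators seeded identically produce identical draws.
    rng_a = np.random.default_rng(100)
    rng_b = np.random.default_rng(100)
    assert rng_a.integers(2 ** 31) == rng_b.integers(2 ** 31)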
...@@ -38,20 +38,46 @@ class SMACClassArgsValidator(ClassArgsValidator): ...@@ -38,20 +38,46 @@ class SMACClassArgsValidator(ClassArgsValidator):
class SMACTuner(Tuner): class SMACTuner(Tuner):
""" """
This is a wrapper of [SMAC](https://github.com/automl/SMAC3) following NNI tuner interface. `SMAC <https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf>`__ is based on Sequential Model-Based Optimization (SMBO).
It only supports ``SMAC`` mode, and does not support the multiple instances of SMAC3 (i.e., It adapts the most prominent previously used model class (Gaussian stochastic process models)
the same configuration is run multiple times). and introduces the model class of random forests to SMBO in order to handle categorical parameters.
The SMAC supported by nni is a wrapper on `the SMAC3 github repo <https://github.com/automl/SMAC3>`__,
following NNI tuner interface :class:`nni.tuner.Tuner`. For algorithm details of SMAC, please refer to the paper
:footcite:t:`hutter2011sequential`.
Note that SMAC on nni only supports a subset of the types in
:doc:`search space </hpo/search_space>`:
``choice``, ``randint``, ``uniform``, ``loguniform``, and ``quniform``.
Note that SMAC needs additional installation using the following command:
.. code-block:: bash
pip install nni[SMAC]
``swig`` is required by SMAC. On Ubuntu, ``swig`` can be installed with ``apt``.
Examples
--------
.. code-block::
config.tuner.name = 'SMAC'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has already been generated. If True, the tuner will not generate a configuration that has already been generated.
If False, a configuration may be generated twice, but this is rare for a relatively large search space. If False, a configuration may be generated twice, but this is rare for a relatively large search space.
""" """
def __init__(self, optimize_mode="maximize", config_dedup=False): def __init__(self, optimize_mode="maximize", config_dedup=False):
"""
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has been already generated.
If False, a configuration may be generated twice, but it is rare for relatively large search space.
"""
self.logger = logger self.logger = logger
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.total_data = {} self.total_data = {}
......
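A configuration that also enables deduplication might look like the sketch below; whether ``config_dedup`` is worth enabling depends on how costly a repeated trial is relative to the size of the search space:

.. code-block::

    config.tuner.name = 'SMAC'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'config_dedup': True,  # never re-suggest an already-generated configuration
    }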
...@@ -2,26 +2,30 @@ ...@@ -2,26 +2,30 @@
# Licensed under the MIT license. # Licensed under the MIT license.
""" """
Tree-structured Parzen Estimator (TPE) tuner for hyper-parameter optimization. Tree-structured Parzen Estimator (TPE) tuner.
Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py
This is a slightly modified re-implementation of the algorithm. This is a slightly modified re-implementation of the algorithm.
""" """
__all__ = ['TpeTuner', 'TpeArguments', 'suggest', 'suggest_parameter'] from __future__ import annotations
__all__ = ['TpeTuner', 'TpeArguments']
from collections import defaultdict from collections import defaultdict
import logging import logging
import math import math
from typing import NamedTuple, Optional, Union from typing import Any, NamedTuple
import numpy as np import numpy as np
from scipy.special import erf # pylint: disable=no-name-in-module from scipy.special import erf # pylint: disable=no-name-in-module
from nni.tuner import Tuner
from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters
from nni.tuner import Tuner
from nni.typehint import Literal
from nni.utils import extract_scalar_reward from nni.utils import extract_scalar_reward
from . import random_tuner from . import random_tuner
...@@ -31,12 +35,13 @@ _logger = logging.getLogger('nni.tuner.tpe') ...@@ -31,12 +35,13 @@ _logger = logging.getLogger('nni.tuner.tpe')
class TpeArguments(NamedTuple): class TpeArguments(NamedTuple):
""" """
These are the hyper-parameters of TPE algorithm itself. Hyperparameters of TPE algorithm itself.
To avoid confusing with trials' hyper-parameters, they are called "arguments" in this code.
To avoid confusion with the trials' hyperparameters being tuned, these are called "arguments" here.
Parameters Parameters
========== ----------
constant_liar_type: 'best' | 'worst' | 'mean' | None (default: 'best') constant_liar_type
TPE algorithm itself does not support parallel tuning. TPE algorithm itself does not support parallel tuning.
This parameter specifies how to optimize for trial_concurrency > 1. This parameter specifies how to optimize for trial_concurrency > 1.
...@@ -44,20 +49,21 @@ class TpeArguments(NamedTuple): ...@@ -44,20 +49,21 @@ class TpeArguments(NamedTuple):
How each liar works is explained in paper's section 6.1. How each liar works is explained in paper's section 6.1.
In general "best" suit for small trial number and "worst" suit for large trial number. In general "best" suit for small trial number and "worst" suit for large trial number.
(:doc:`experiment result </misc/parallelizing_tpe_search>`)
n_startup_jobs: int (default: 20) n_startup_jobs
The first N hyper-parameters are generated fully randomly for warming up. The first N hyperparameters are generated fully randomly for warming up.
If the search space is large, you can increase this value. If the search space is large, you can increase this value.
Or if max_trial_number is small, you may want to decrease it. Or if max_trial_number is small, you may want to decrease it.
n_ei_candidates: int (default: 24) n_ei_candidates
For each iteration TPE samples EI for N sets of parameters and chooses the best one (loosely speaking). For each iteration TPE samples EI for N sets of parameters and chooses the best one (loosely speaking).
linear_forgetting: int (default: 25) linear_forgetting
TPE will lower the weights of old trials. TPE will lower the weights of old trials.
This controls how many iterations it takes for a trial to start decaying. This controls how many iterations it takes for a trial to start decaying.
prior_weight: float (default: 1.0) prior_weight
TPE treats user provided search space as prior. TPE treats user provided search space as prior.
When generating new trials, it also incorporates the prior in trial history by transforming the search space to When generating new trials, it also incorporates the prior in trial history by transforming the search space to
one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range). one trial configuration (i.e., each parameter of this configuration chooses the mean of its candidate range).
...@@ -66,11 +72,11 @@ class TpeArguments(NamedTuple): ...@@ -66,11 +72,11 @@ class TpeArguments(NamedTuple):
With prior weight 1.0, the search space is treated as one good trial. With prior weight 1.0, the search space is treated as one good trial.
For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result. For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result.
gamma: float (default: 0.25) gamma
Controls how many trials are considered "good". Controls how many trials are considered "good".
The number is calculated as "min(gamma * sqrt(N), linear_forgetting)". The number is calculated as "min(gamma * sqrt(N), linear_forgetting)".
""" """
constant_liar_type: Optional[str] = 'best' constant_liar_type: Literal['best', 'worst', 'mean'] | None = 'best'
n_startup_jobs: int = 20 n_startup_jobs: int = 20
n_ei_candidates: int = 24 n_ei_candidates: int = 24
linear_forgetting: int = 25 linear_forgetting: int = 25
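As a quick check of the ``gamma`` formula quoted in the docstring above (a sketch; the exact rounding in the implementation may differ):

.. code-block:: python

    import math

    def num_good_trials(n_history, gamma=0.25, linear_forgetting=25):
        # min(gamma * sqrt(N), linear_forgetting): with the defaults, 100
        # observed trials give min(ceil(0.25 * 10), 25) = 3 "good" trials.
        return min(math.ceil(gamma * math.sqrt(n_history)), linear_forgetting)

    assert num_good_trials(100) == 3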
...@@ -79,18 +85,68 @@ class TpeArguments(NamedTuple): ...@@ -79,18 +85,68 @@ class TpeArguments(NamedTuple):
class TpeTuner(Tuner): class TpeTuner(Tuner):
""" """
Tree-structured Parzen Estimator (TPE) tuner.
TPE is a lightweight tuner that has no extra dependency and supports all search space types,
designed to be the default tuner.
It has the drawback that TPE cannot discover relationships between different hyperparameters.
**Implementation**
TPE is an SMBO algorithm.
It models P(x|y) and P(y) where x represents hyperparameters and y the evaluation result.
P(x|y) is modeled by transforming the generative process of hyperparameters,
replacing the distributions of the configuration prior with non-parametric densities.
Paper: `Algorithms for Hyper-Parameter Optimization
<https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf>`__
Examples
--------
.. code-block::
## minimal config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
.. code-block::
## advanced config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
'optimize_mode': 'maximize',
'seed': 12345,
'tpe_args': {
'constant_liar_type': 'mean',
'n_startup_jobs': 10,
'n_ei_candidates': 20,
'linear_forgetting': 100,
'prior_weight': 0,
'gamma': 0.5
}
}
Parameters Parameters
========== ----------
optimze_mode: 'minimize' | 'maximize' (default: 'minimize') optimize_mode: Literal['minimize', 'maximize']
Whether to minimize or maximize trial result. Whether to minimize or maximize trial result.
seed: int | None seed
The random seed. The random seed.
tpe_args: dict[string, Any] | None tpe_args
Advanced users can use this to customize TPE tuner. Advanced users can use this to customize TPE tuner.
See `TpeArguments` for details. See :class:`TpeArguments` for details.
""" """
def __init__(self, optimize_mode='minimize', seed=None, tpe_args=None): def __init__(self,
optimize_mode: Literal['minimize', 'maximize'] = 'minimize',
seed: int | None = None,
tpe_args: dict[str, Any] | None = None):
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
self.args = TpeArguments(**(tpe_args or {})) self.args = TpeArguments(**(tpe_args or {}))
self.space = None self.space = None
...@@ -183,7 +239,7 @@ def suggest_parameter(args, rng, spec, parameter_history): ...@@ -183,7 +239,7 @@ def suggest_parameter(args, rng, spec, parameter_history):
## Utilities part ## ## Utilities part ##
class Record(NamedTuple): class Record(NamedTuple):
param: Union[int, float] param: int | float
loss: float loss: float
class BestLiar: # assume running parameters have best result, it accelerates "converging" class BestLiar: # assume running parameters have best result, it accelerates "converging"
...@@ -305,7 +361,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma): ...@@ -305,7 +361,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
this function is used for everything other than "choice" and "randint". this function is used for everything other than "choice" and "randint".
Parameters Parameters
========== ----------
args: TpeArguments args: TpeArguments
Algorithm arguments. Algorithm arguments.
history_mus: 1-d array of float history_mus: 1-d array of float
...@@ -317,7 +373,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma): ...@@ -317,7 +373,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
σ value of normal search space. σ value of normal search space.
Returns Returns
======= -------
Tuple of three 1-d float arrays: (weight, µ, σ). Tuple of three 1-d float arrays: (weight, µ, σ).
The tuple represents N+1 "vicinity of observations" and each one's weight, The tuple represents N+1 "vicinity of observations" and each one's weight,
......
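The constant-liar options referenced in ``TpeArguments`` (and in the ``BestLiar`` class above) can be illustrated with a standalone sketch; this is not the module's actual implementation:

.. code-block:: python

    def lie_for_running_trials(finished_losses, n_running, liar_type='best'):
        # While trials are still running, append a fake ("liar") loss for
        # each, so concurrent suggestions do not cluster in one region.
        if liar_type == 'best':
            lie = min(finished_losses)   # optimistic placeholder
        elif liar_type == 'worst':
            lie = max(finished_losses)   # pessimistic placeholder
        else:                            # 'mean'
            lie = sum(finished_losses) / len(finished_losses)
        return finished_losses + [lie] * n_running

    print(lie_for_running_trials([0.3, 0.5], n_running=2))  # [0.3, 0.5, 0.3, 0.3]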
...@@ -5,6 +5,7 @@ import copy ...@@ -5,6 +5,7 @@ import copy
import functools import functools
import inspect import inspect
import numbers import numbers
import os
import sys import sys
import types import types
import warnings import warnings
...@@ -257,6 +258,13 @@ def trace(cls_or_func: T = None, *, kw_only: bool = True, inheritable: bool = Fa ...@@ -257,6 +258,13 @@ def trace(cls_or_func: T = None, *, kw_only: bool = True, inheritable: bool = Fa
pass pass
""" """
# This is an internal flag to control the behavior of trace.
# Useful in doc build and tests.
# Might be changed in future.
nni_trace_flag = os.environ.get('NNI_TRACE_FLAG', '')
if nni_trace_flag.lower() == 'disable':
return cls_or_func
def wrap(cls_or_func): def wrap(cls_or_func):
# already annotated, do nothing # already annotated, do nothing
if is_wrapped_with_trace(cls_or_func): if is_wrapped_with_trace(cls_or_func):
......
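The new flag can be exercised as below, assuming (as the hunk shows) that the variable is read when ``trace`` is applied, so it must be set before any decoration runs:

.. code-block:: python

    import os

    # Set before any @nni.trace decoration runs, e.g. in a doc build or test.
    os.environ['NNI_TRACE_FLAG'] = 'disable'

    import nni

    @nni.trace
    def make_criterion():
        return lambda output, target: abs(output - target)

    # With the flag set to 'disable', trace() returns the callable unchanged.
    criterion = make_criterion()
    print(criterion(1.0, 0.5))  # 0.5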