Unverified Commit 51d261e7 authored by J-shang's avatar J-shang Committed by GitHub

Merge pull request #4668 from microsoft/doc-refactor

parents d63a2ea3 b469e1c1
......@@ -124,7 +124,21 @@ class WeightScoreTrainerBasedDataCollector(TrainerBasedDataCollector):
class MovementPruner(BasicPruner):
"""
r"""
Movement pruner is an implementation of movement pruning.
This is a "fine-pruning" algorithm, which means the masks may change during each fine-tuning step.
Each weight element is scored by the negative of the sum, over training steps, of the product of the weight and its gradient.
This means that weight elements moving towards zero accumulate negative scores, while weight elements moving away from zero accumulate positive scores.
The weight elements with low scores will be masked during inference.
The following figure from the paper shows how weights are pruned by movement pruning.
.. image:: ../../img/movement_pruning.png
:target: ../../img/movement_pruning.png
:alt:
For more details, please refer to `Movement Pruning: Adaptive Sparsity by Fine-Tuning <https://arxiv.org/abs/2005.07683>`__.
Parameters
----------
model : torch.nn.Module
......@@ -158,7 +172,7 @@ class MovementPruner(BasicPruner):
model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters()).
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_epochs : int
......@@ -171,6 +185,21 @@ class MovementPruner(BasicPruner):
The number of steps at which sparsity stops growing; note that the sparsity stopping growth doesn't mean the masks stop changing.
The sparsity after each `optimizer.step()` is:
total_sparsity * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3).
Examples
--------
>>> import torch
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import MovementPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initializing it
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = MovementPruner(model, config_list, trainer, traced_optimizer, criterion, 10, 3000, 27000)
>>> masked_model, masks = pruner.compress()
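The cubic sparsity schedule above can be sketched as follows (a minimal sketch using the example's values, assuming zero sparsity during warm-up; NNI computes this internally):

.. code-block:: python

    # hypothetical values matching the example above
    total_sparsity, warm_up_step, cool_down_beginning_step = 0.8, 3000, 27000

    def sparsity_at(current_step: int) -> float:
        if current_step < warm_up_step:
            return 0.0  # assumption: no sparsity during warm-up
        if current_step >= cool_down_beginning_step:
            return total_sparsity
        ratio = (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)
        return total_sparsity * (1 - (1 - ratio) ** 3)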
For a detailed example, please refer to :githublink:`examples/model_compress/pruning/v2/movement_pruning_glue.py <examples/model_compress/pruning/v2/movement_pruning_glue.py>`
"""
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
traced_optimizer: Traceable, criterion: Callable[[Tensor, Tensor], Tensor], training_epochs: int, warm_up_step: int,
......
......@@ -22,15 +22,14 @@ _logger = logging.getLogger(__name__)
class DataCollector:
"""
An abstract class for collecting the data needed by the compressor.
Parameters
----------
compressor
The compressor bound to this DataCollector.
"""
def __init__(self, compressor: Compressor):
"""
Parameters
----------
compressor
The compressor bound to this DataCollector.
"""
self.compressor = compressor
def reset(self):
......@@ -242,42 +241,43 @@ class TrainerBasedDataCollector(DataCollector):
class MetricsCalculator:
"""
An abstract class for calculating a kind of metric on the given data.
"""
def __init__(self, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None):
"""
Parameters
----------
dim
The dimensions in the collected data that correspond to the weight dimensions under pruning.
None means a one-to-one correspondence between pruned dimensions and data, which is equal to setting `dim` as all data dimensions.
Only these `dim` will be kept and the other dimensions of the data will be reduced.
Example:
Parameters
----------
dim
The dimensions in the collected data that correspond to the weight dimensions under pruning.
None means a one-to-one correspondence between pruned dimensions and data, which is equal to setting `dim` as all data dimensions.
Only these `dim` will be kept and the other dimensions of the data will be reduced.
If you want to prune the Conv2d weight at the filter level, and the weight size is (32, 16, 3, 3) [out-channel, in-channel, kernel-size-1, kernel-size-2],
then the dimension under pruning is [0], which means you want to prune the filters or out-channels.
Example:
Case 1: Directly collect the conv module weight as data to calculate the metric.
Then the data has size (32, 16, 3, 3).
Note that dimension 0 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=0` will be set in `__init__`.
If you want to prune the Conv2d weight at the filter level, and the weight size is (32, 16, 3, 3) [out-channel, in-channel, kernel-size-1, kernel-size-2],
then the dimension under pruning is [0], which means you want to prune the filters or out-channels.
Case 2: Use the output of the conv module as data to calculate the metric.
Then the data has size (batch_num, 32, feature_map_size_1, feature_map_size_2).
Note that dimension 1 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=1` will be set in `__init__`.
Case 1: Directly collect the conv module weight as data to calculate the metric.
Then the data has size (32, 16, 3, 3).
Note that dimension 0 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=0` will be set in `__init__`.
In both of these cases, the metric of this module has size (32,).
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Case 2: Use the output of the conv module as data to calculate the metric.
Then the data has size (batch_num, 32, feature_map_size_1, feature_map_size_2).
Note that dimension 1 of the data corresponds to the under-pruning weight dimension 0.
So in this case, `dim=1` will be set in `__init__`.
Example:
In both of these cases, the metric of this module has size (32,).
The under-pruning weight size is (768, 768), and you want to apply block sparsity on dim=[0] with block size [64, 768];
then you can set block_sparse_size=[64]. The final metric size is (12,).
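A minimal sketch of how `dim` reduces collected data to a metric (assuming a simple mean of absolute values; concrete calculators may use other reductions):

.. code-block:: python

    import torch

    data = torch.rand(8, 32, 24, 24)  # e.g. conv output: (batch, out-channel, h, w)
    dim = [1]                         # under-pruning weight dimension 0 maps to data dimension 1
    reduce_dims = tuple(d for d in range(data.dim()) if d not in dim)
    metric = data.abs().mean(dim=reduce_dims)  # metric size: (32,)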
"""
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The under-pruning weight size is (768, 768), and you want to apply block sparsity on dim=[0] with block size [64, 768];
then you can set block_sparse_size=[64]. The final metric size is (12,).
"""
def __init__(self, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None):
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
if self.block_sparse_size is not None:
......@@ -307,36 +307,35 @@ class MetricsCalculator:
class SparsityAllocator:
"""
An abstract class for allocating masks based on metrics.
Parameters
----------
pruner
The pruner bound to this `SparsityAllocator`.
dim
The weight dimensions under pruning; the metric size should equal the under-pruning weight size on these dimensions.
None means a one-to-one correspondence between pruned dimensions and metric, which is equal to setting `dim` as all weight dimensions under pruning.
The mask will be expanded to the weight size depending on `dim`.
Example:
The weight under pruning has size (2, 3, 4), and `dim=1` means the weight dimension under pruning is 1.
Then the metric should have size (3,), e.g., `metric=[0.9, 0.1, 0.8]`.
Assuming some kind of `SparsityAllocator` gets the mask on weight dimension 1 as `mask=[1, 0, 1]`,
then the dimension mask will be expanded to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The metric size is (12,) and block_sparse_size=[64]; then the mask will first be expanded to (768,) before being expanded with `dim`.
continuous_mask
If set to True, inherit the mask already existing in the wrapper.
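A minimal sketch of the mask expansion described above (illustrative only, not NNI's internal code):

.. code-block:: python

    import torch

    metric = torch.tensor([0.9, 0.1, 0.8])  # metric on weight dimension 1
    mask_1d = (metric >= 0.8).int()         # -> [1, 0, 1]
    # expand the dimension mask to the full weight size (2, 3, 4)
    mask = mask_1d.reshape(1, 3, 1).expand(2, 3, 4)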
"""
def __init__(self, pruner: Compressor, dim: Optional[Union[int, List[int]]] = None,
block_sparse_size: Optional[Union[int, List[int]]] = None, continuous_mask: bool = True):
"""
Parameters
----------
pruner
The pruner bound to this `SparsityAllocator`.
dim
The weight dimensions under pruning; the metric size should equal the under-pruning weight size on these dimensions.
None means a one-to-one correspondence between pruned dimensions and metric, which is equal to setting `dim` as all weight dimensions under pruning.
The mask will be expanded to the weight size depending on `dim`.
Example:
The weight under pruning has size (2, 3, 4), and `dim=1` means the weight dimension under pruning is 1.
Then the metric should have size (3,), e.g., `metric=[0.9, 0.1, 0.8]`.
Assuming some kind of `SparsityAllocator` gets the mask on weight dimension 1 as `mask=[1, 0, 1]`,
then the dimension mask will be expanded to the final mask `[[[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]]`.
block_sparse_size
This is used to describe the block size a metric value represents. By default, None means the block size is ones(len(dim)).
Make sure len(dim) == len(block_sparse_size), and each block_sparse_size dimension position corresponds to dim.
Example:
The metric size is (12,) and block_sparse_size=[64]; then the mask will first be expanded to (768,) before being expanded with `dim`.
continuous_mask
If set to True, inherit the mask already existing in the wrapper.
"""
self.pruner = pruner
self.dim = dim if not isinstance(dim, int) else [dim]
self.block_sparse_size = block_sparse_size if not isinstance(block_sparse_size, int) else [block_sparse_size]
......
......@@ -200,6 +200,17 @@ def compute_sparsity(origin_model: Module, compact_model: Module, compact_model_
The compact model is the origin model after pruning,
and it may have a different structure from origin_model because of speedup.
Parameters
----------
origin_model : torch.nn.Module
The original un-pruned model.
compact_model : torch.nn.Module
The model after speedup, or the original model.
compact_model_masks: Dict[str, Dict[str, Tensor]]
The masks applied on the compact model; if the original model has been sped up, this should be {}.
config_list : List[Dict]
The config_list used for pruning the original model.
Returns
-------
Tuple[List[Dict], List[Dict], List[Dict]]
......
......@@ -20,27 +20,64 @@ LOGGER = logging.getLogger('batch_tuner_AutoML')
class BatchTuner(Tuner):
"""
BatchTuner is a tuner which runs all the configurations that users want to run, in a batch.
Batch tuner is a special tuner that allows users to simply provide several hyperparameter sets,
and it will evaluate each set.
Batch tuner does **not** support standard search space.
Search space of batch tuner looks like a single ``choice`` in standard search space,
but it has different meaning.
Consider the following search space:
.. code-block::
'combine_params': {
'_type': 'choice',
'_value': [
{'x': 0, 'y': 1},
{'x': 1, 'y': 2},
{'x': 1, 'y': 3},
]
}
Batch tuner will generate the following 3 hyperparameter sets:
1. {'x': 0, 'y': 1}
2. {'x': 1, 'y': 2}
3. {'x': 1, 'y': 3}
If this search space were used with the grid search tuner, it would instead generate:
1. {'combine_params': {'x': 0, 'y': 1 }}
2. {'combine_params': {'x': 1, 'y': 2 }}
3. {'combine_params': {'x': 1, 'y': 3 }}
Examples
--------
Only a search space like the following is accepted:
::
{'combine_params':
{ '_type': 'choice',
'_value': '[{...}, {...}, {...}]',
}
.. code-block::
config.search_space = {
'combine_params': {
'_type': 'choice',
'_value': [
{'optimizer': 'Adam', 'learning_rate': 0.001},
{'optimizer': 'Adam', 'learning_rate': 0.0001},
{'optimizer': 'Adam', 'learning_rate': 0.00001},
{'optimizer': 'SGD', 'learning_rate': 0.01},
{'optimizer': 'SGD', 'learning_rate': 0.005},
]
}
}
config.tuner.name = 'BatchTuner'
"""
def __init__(self):
self._count = -1
self._values = []
def is_valid(self, search_space):
def _is_valid(self, search_space):
"""
Check that the search space is valid: it may only contain the 'choice' type
......@@ -70,27 +107,10 @@ class BatchTuner(Tuner):
return None
def update_search_space(self, search_space):
"""Update the search space
Parameters
----------
search_space : dict
"""
validate_search_space(search_space, ['choice'])
self._values = self.is_valid(search_space)
self._values = self._is_valid(search_space)
def generate_parameters(self, parameter_id, **kwargs):
"""Returns a dict of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
dict
A candidate parameter group.
"""
self._count += 1
if self._count > len(self._values) - 1:
raise nni.NoMoreTrialError('no more parameters now.')
......@@ -100,13 +120,6 @@ class BatchTuner(Tuner):
pass
def import_data(self, data):
"""Import additional data for tuning
Parameters
----------
data:
a list of dictionaries, each of which has at least two keys, 'parameter' and 'value'
"""
if not self._values:
LOGGER.info("Search space has not been initialized, skip this data import")
return
......
......@@ -249,20 +249,52 @@ class BOHBClassArgsValidator(ClassArgsValidator):
class BOHB(MsgDispatcherBase):
"""
BOHB performs robust and efficient hyperparameter optimization
at scale by combining the speed of Hyperband searches with the
guidance and guarantees of convergence of Bayesian Optimization.
Instead of sampling new configurations at random, BOHB uses
kernel density estimators to select promising candidates.
`BOHB <https://arxiv.org/abs/1807.01774>`__ is a robust and efficient hyperparameter tuning algorithm at scale.
BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".
BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget,
but it replaces the random selection of configurations at the beginning of each HB iteration
by a model-based search (Bayesian Optimization).
Once the desired number of configurations for the iteration is reached,
the standard successive halving procedure is carried out using these configurations.
It keeps track of the performance of all function evaluations g(x, b) of configurations x
on all budgets b to use as a basis for its models in later iterations.
Please refer to the paper :footcite:t:`falkner2018bohb` for detailed algorithm.
Note that BOHB needs additional installation using the following command:
.. code-block:: bash
pip install nni[BOHB]
Examples
--------
.. code-block::
config.advisor.name = 'BOHB'
config.advisor.class_args = {
'optimize_mode': 'maximize',
'min_budget': 1,
'max_budget': 27,
'eta': 3,
'min_points_in_model': 7,
'top_n_percent': 15,
'num_samples': 64,
'random_fraction': 0.33,
'bandwidth_factor': 3.0,
'min_bandwidth': 0.001
}
Parameters
----------
optimize_mode: str
optimize mode, 'maximize' or 'minimize'
Optimize mode, 'maximize' or 'minimize'.
min_budget: float
The smallest budget to consider. Needs to be positive!
The smallest budget to assign to a trial job, (budget can be the number of mini-batches or epochs).
Needs to be positive.
max_budget: float
The largest budget to consider. Needs to be larger than min_budget!
The largest budget to assign to a trial job. Needs to be larger than min_budget.
The budgets will be geometrically distributed
:math:`\\sim \\eta^k` for :math:`k \\in [0, 1, ... , num\\_subsets - 1]`.
eta: int
......@@ -271,21 +303,102 @@ class BOHB(MsgDispatcherBase):
1/eta of them 'advances' to the next round.
Must be greater or equal to 2.
min_points_in_model: int
number of observations to start building a KDE. Default 'None' means
dim+1, the bare minimum.
Number of observations to start building a KDE. Default 'None' means dim+1;
when the number of completed trials in this budget is equal to or larger than ``max{dim+1, min_points_in_model}``,
BOHB will start to build a KDE model of this budget and then use that KDE model to guide configuration selection.
Needs to be positive. (dim means the number of hyperparameters in the search space)
top_n_percent: int
percentage ( between 1 and 99, default 15) of the observations that are considered good.
Percentage (between 1 and 99, default 15) of the observations which are considered good.
Good points and bad points are used for building KDE models.
For example, if you have 100 observed trials and top_n_percent is 15,
then the top 15% of points will be used for building the good-point model "l(x)".
The remaining 85% of points will be used for building the bad-point model "g(x)".
num_samples: int
number of samples to optimize EI (default 64)
Number of samples to optimize EI (default 64).
In this case, it will sample "num_samples" points and compare the result of l(x)/g(x).
Then it will return the one with the maximum l(x)/g(x) value as the next configuration
if the optimize_mode is ``maximize``. Otherwise, it returns the smallest one.
random_fraction: float
fraction of purely random configurations that are sampled from the
prior without the model.
Fraction of purely random configurations that are sampled from the prior without the model.
bandwidth_factor: float
to encourage diversity, the points proposed to optimize EI, are sampled
from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3)
To encourage diversity, the points proposed to optimize EI are sampled
from a 'widened' KDE where the bandwidth is multiplied by this factor (default: 3).
It is suggested to use the default value if you are not familiar with KDE.
min_bandwidth: float
to keep diversity, even when all (good) samples have the same value for one of the parameters,
a minimum bandwidth (Default: 1e-3) is used instead of zero.
To keep diversity, even when all (good) samples have the same value for one of the parameters,
a minimum bandwidth (default: 1e-3) is used instead of zero.
It is suggested to use the default value if you are not familiar with KDE.
config_space: str
Directly use a .pcs file serialized by `ConfigSpace <https://automl.github.io/ConfigSpace/>`__ in "pcs new" format.
In this case, search space file (if provided in config) will be ignored.
Note that this path needs to be an absolute path. Relative path is currently not supported.
Notes
-----
Below is an introduction to the BOHB process, separated into two parts:
**The first part HB (Hyperband).**
BOHB follows Hyperband's way of choosing the budgets and continues to use SuccessiveHalving.
For more details, you can refer to the :class:`nni.algorithms.hpo.hyperband_advisor.Hyperband`
and the `reference paper for Hyperband <https://arxiv.org/abs/1603.06560>`__.
This procedure is summarized by the pseudocode below.
.. image:: ../../img/bohb_1.png
:scale: 80 %
:align: center
**The second part BO (Bayesian Optimization).**
The BO part of BOHB closely resembles TPE with one major difference:
It opted for a single multidimensional KDE compared to the hierarchy of one-dimensional KDEs used in TPE
in order to better handle interaction effects in the input space.
The Tree-structured Parzen Estimator (TPE) uses a KDE (kernel density estimator) to model the densities.
.. image:: ../../img/bohb_2.png
:scale: 80 %
:align: center
To fit useful KDEs, we require a minimum number of data points Nmin;
this is set to d + 1 for our experiments, where d is the number of hyperparameters.
To build a model as early as possible, we do not wait until Nb = \|Db\|,
the number of observations for budget b, is large enough to satisfy q · Nb ≥ Nmin.
Instead, after initializing with Nmin + 2 random configurations, we choose the
best and worst configurations, respectively, to model the two densities.
Note that it also samples a constant fraction named **random fraction** of the configurations uniformly at random.
.. image:: ../../img/bohb_3.png
:scale: 80 %
:align: center
.. image:: ../../img/bohb_6.jpg
:scale: 65 %
:align: center
**The above image shows the workflow of BOHB.**
Here we set max_budget = 9, min_budget = 1, eta = 3, and the others as default.
In this case, s_max = 2, so we will continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle.
In each stage of SuccessiveHalving (the orange box), it will pick the top 1/eta configurations and run them again with more budget,
repeating the SuccessiveHalving stage until the end of this iteration.
At the same time, it collects the configurations, budgets and final metrics of each trial
and uses these to build a multidimensional KDE model with the key "budget".
Multidimensional KDE is used to guide the selection of configurations for the next iteration.
The sampling procedure (using Multidimensional KDE to guide selection) is summarized by the pseudocode below.
.. image:: ../../img/bohb_4.png
:scale: 80 %
:align: center
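A rough sketch of this model-based sampling step (illustrative only, assuming scikit-learn's ``KernelDensity`` as the KDE; NNI's implementation differs in its details):

.. code-block:: python

    import numpy as np
    from sklearn.neighbors import KernelDensity

    def propose(good_kde: KernelDensity, bad_kde: KernelDensity, num_samples=64):
        # sample candidates from the "good" KDE l(x) and rank them by l(x)/g(x)
        candidates = good_kde.sample(num_samples)
        score = good_kde.score_samples(candidates) - bad_kde.score_samples(candidates)
        return candidates[int(np.argmax(score))]  # maximize log l(x) - log g(x)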
**Here is a simple experiment which tunes MNIST with BOHB.**
Code implementation: :githublink:`examples/trials/mnist-advisor <examples/trials/mnist-advisor>`
The following shows the final experimental results:
.. image:: ../../img/bohb_5.png
:scale: 80 %
:align: center
More experimental results can be found in the `reference paper <https://arxiv.org/abs/1807.01774>`__.
It shows that BOHB makes good use of previous results and has a balanced trade-off between exploration and exploitation.
"""
def __init__(self,
......
......@@ -22,18 +22,52 @@ class CurvefittingClassArgsValidator(ClassArgsValidator):
}).validate(kwargs)
class CurvefittingAssessor(Assessor):
"""CurvefittingAssessor uses learning curve fitting algorithm to predict the learning curve performance in the future.
"""
CurvefittingAssessor uses a learning curve fitting algorithm to predict future learning curve performance.
The intermediate result **must** be accuracy. Curve fitting does not support minimizing loss.
Curve fitting assessor is an LPA (learning, predicting, assessing) algorithm.
It stops a pending trial X at step S if the trial's forecast result at the target step has converged
and is lower than the best performance in the history.
Paper: `Speeding up Automatic Hyperparameter Optimization of Deep Neural Networks by Extrapolation of Learning Curves
<https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf>`__
Examples
--------
.. code-block::
config.assessor.name = 'Curvefitting'
config.assessor.class_args = {
'epoch_num': 20,
'start_step': 6,
'threshold': 0.95,
'gap': 1,
}
Parameters
----------
epoch_num : int
The total number of epoch
The total number of epochs.
We need to know the number of epochs to determine which points we need to predict.
start_step : int
only after receiving start_step number of reported intermediate results
A trial is determined to be stopped or not only after receiving start_step number of intermediate results.
threshold : float
The threshold that we decide to early stop the worse performance curve.
The threshold that we use to decide to early stop the worst performance curve.
For example: if threshold = 0.95, and the best performance in the history is 0.9,
then we will stop the trial whose predicted value is lower than 0.95 * 0.9 = 0.855.
gap : int
The gap interval between assessor judgements.
For example: if gap = 2, start_step = 6,
then we will assess the result when we get 6, 8, 10, 12, ... intermediate results.
"""
def __init__(self, epoch_num=20, start_step=6, threshold=0.95, gap=1):
......@@ -56,15 +90,6 @@ class CurvefittingAssessor(Assessor):
logger.info('Successfully initialized the curvefitting assessor')
def trial_end(self, trial_job_id, success):
"""update the best performance of completed trial job
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if the trial successfully finished, False otherwise
"""
if success:
if self.set_best_performance:
self.completed_best_performance = max(self.completed_best_performance, self.trial_history[-1])
......@@ -76,25 +101,6 @@ class CurvefittingAssessor(Assessor):
logger.info('No need to update, trial job id: %s', trial_job_id)
def assess_trial(self, trial_job_id, trial_history):
"""assess whether a trial should be early stop by curve fitting algorithm
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognized exception in curvefitting_assessor
"""
scalar_trial_history = extract_scalar_history(trial_history)
self.trial_history = scalar_trial_history
if not self.set_best_performance:
......
......@@ -44,7 +44,20 @@ def _random_config(search_space, random_state):
class DNGOTuner(Tuner):
"""
Use neural networks as an alternative to GPs to model distributions over functions in Bayesian optimization.
Parameters
----------
optimize_mode : 'maximize' | 'minimize', default = 'maximize'
If 'maximize', the tuner will target to maximize metrics. If 'minimize', the tuner will target to minimize metrics.
sample_size : int, default = 1000
Number of samples to select in each iteration. The best one will be picked from the samples as the next trial.
trials_per_update : int, default = 20
Number of trials to collect before updating the model.
num_epochs_per_training : int, default = 500
Number of epochs to train DNGO model.
"""
def __init__(self, optimize_mode='maximize', sample_size=1000, trials_per_update=20, num_epochs_per_training=500):
self.searchspace_json = None
self.random_state = None
......
......@@ -4,6 +4,7 @@
"""
evolution_tuner.py
"""
from __future__ import annotations
import copy
import random
......@@ -22,28 +23,19 @@ logger = logging.getLogger(__name__)
class Individual:
"""
Indicidual class to store the indv info.
Individual class to store the individual's info.
Attributes
Parameters
----------
config : str
config : str, default = None
A config to represent a group of parameters.
info : str
info : str, default = None
The str to save information of individual.
result : float
result : float, default = None
The final metric of an individual.
"""
def __init__(self, config=None, info=None, result=None):
"""
Parameters
----------
config : str
A config to represent a group of parameters.
info : str
result : float
save_dir : str
"""
self.config = config
self.result = result
self.info = info
......@@ -61,18 +53,36 @@ class EvolutionClassArgsValidator(ClassArgsValidator):
class EvolutionTuner(Tuner):
"""
EvolutionTuner is a tuner using the naive evolution algorithm.
Naive Evolution comes from `Large-Scale Evolution of Image Classifiers <https://arxiv.org/pdf/1703.01041.pdf>`__.
It randomly initializes a population based on the search space.
For each generation, it chooses the better ones and does some mutation
(e.g., changes a hyperparameter, adds/removes one layer, etc.) on them to get the next generation.
Naive Evolution requires many trials to work, but it's very simple and easily expanded with new features.
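A minimal sketch of one generation (illustrative only, not NNI's internal code; ``mutate`` stands for any mutation such as changing one hyperparameter):

.. code-block:: python

    import copy
    import random

    def next_generation(population, mutate, population_size):
        # keep the better half (assuming higher result is better),
        # then mutate copies of random survivors to refill the population
        survivors = sorted(population, key=lambda ind: ind['result'], reverse=True)
        survivors = survivors[:population_size // 2]
        children = []
        while len(survivors) + len(children) < population_size:
            parent = copy.deepcopy(random.choice(survivors))
            children.append(mutate(parent))
        return survivors + children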
Examples
--------
.. code-block::
config.tuner.name = 'Evolution'
config.tuner.class_args = {
'optimize_mode': 'maximize',
'population_size': 100
}
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
population_size: int
The initial size of the population (trial num) in the evolution tuner (default = 32).
The larger the population size, the better the evolution performance.
It's suggested that ``population_size`` be much larger than ``concurrency`` so users can get the most out of the algorithm,
and it must be at least ``concurrency``, or the tuner will fail on its first generation of parameters.
"""
def __init__(self, optimize_mode="maximize", population_size=32):
"""
Parameters
----------
optimize_mode : str, default 'maximize'
population_size : int
initial population size. The larger population size,
the better evolution performance.
"""
def __init__(self, optimize_mode='maximize', population_size=32):
self.optimize_mode = OptimizeMode(optimize_mode)
self.population_size = population_size
......@@ -89,11 +99,11 @@ class EvolutionTuner(Tuner):
def update_search_space(self, search_space):
"""
Update search space.
``search_space`` contains the information that the user pre-defined.
Parameters
----------
search_space : dict
"""
self.searchspace_json = search_space
......@@ -109,8 +119,10 @@ class EvolutionTuner(Tuner):
"""
To deal with trial failure. If a trial fails,
randomly generate new parameters and add them into the population.
Parameters
----------
parameter_id : int
Unique identifier for hyper-parameters used by this trial.
success : bool
......@@ -136,12 +148,15 @@ class EvolutionTuner(Tuner):
def generate_multiple_parameters(self, parameter_id_list, **kwargs):
"""
Returns multiple sets of trial (hyper-)parameters, as iterable of serializable objects.
Parameters
----------
parameter_id_list : list of int
Unique identifiers for each set of requested hyper-parameters.
**kwargs
Not used
Returns
-------
list
......@@ -182,12 +197,13 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
A group of candaidte parameters that evolution tuner generated.
A group of candidate parameters that evolution tuner generated.
"""
pos = -1
......@@ -234,10 +250,12 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
dict
One newly generated configuration.
"""
......@@ -258,6 +276,7 @@ class EvolutionTuner(Tuner):
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
......
......@@ -41,29 +41,77 @@ class GPClassArgsValidator(ClassArgsValidator):
class GPTuner(Tuner):
"""
GPTuner is a Bayesian Optimization method where Gaussian Process is used for modeling loss functions.
GPTuner is a Bayesian Optimization method where Gaussian Process
is used for modeling loss functions.
Bayesian optimization works by constructing a posterior distribution of functions
(a Gaussian Process) that best describes the function you want to optimize.
As the number of observations grows, the posterior distribution improves,
and the algorithm becomes more certain of which regions in parameter space
are worth exploring and which are not.
GPTuner is designed to minimize/maximize the number of steps required to find
a combination of parameters that are close to the optimal combination.
To do so, this method uses a proxy optimization problem (finding the maximum of
the acquisition function) that, albeit still a hard problem, is cheaper
(in the computational sense) to solve, and it's amenable to common tools.
Therefore, Bayesian Optimization is suggested for situations where sampling the function
to be optimized is very expensive.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``randint``, ``uniform``, ``quniform``, ``loguniform``, ``qloguniform``, and numerical ``choice``.
This optimization approach is described in Section 3 of the paper
`Algorithms for Hyper-Parameter Optimization <https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf>`__
( :footcite:t:`bergstra2011algorithms` ).
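A rough sketch of one Bayesian optimization step with a GP surrogate and the 'ucb' utility (illustrative only, using scikit-learn; NNI's implementation differs in its details):

.. code-block:: python

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor

    def next_point(X_observed, y_observed, X_candidates, kappa=5.0, alpha=1e-6):
        # fit the GP posterior on observations, then maximize the acquisition
        gp = GaussianProcessRegressor(alpha=alpha).fit(X_observed, y_observed)
        mean, std = gp.predict(X_candidates, return_std=True)
        ucb = mean + kappa * std  # a bigger kappa makes the tuner more exploratory
        return X_candidates[int(np.argmax(ucb))]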
Examples
--------
.. code-block::
config.tuner.name = 'GPTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize',
'utility': 'ei',
'kappa': 5.0,
'xi': 0.0,
'nu': 2.5,
'alpha': 1e-6,
'cold_start_num': 10,
'selection_num_warm_up': 100000,
'selection_num_starting_points': 250
}
Parameters
----------
optimize_mode : str
optimize mode, 'maximize' or 'minimize', by default 'maximize'
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
utility : str
utility function (also called 'acquisition function') to use, which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
Utility function (also called 'acquisition function') to use,
which can be 'ei', 'ucb' or 'poi'. By default 'ei'.
kappa : float
value used by utility function 'ucb'. The bigger kappa is, the more the tuner will be exploratory. By default 5.
Value used by utility function 'ucb'. The bigger kappa is,
the more the tuner will be exploratory. By default 5.
xi : float
used by utility function 'ei' and 'poi'. The bigger xi is, the more the tuner will be exploratory. By default 0.
Used by utility function 'ei' and 'poi'. The bigger xi is,
the more the tuner will be exploratory. By default 0.
nu : float
used to specify Matern kernel. The smaller nu, the less smooth the approximated function is. By default 2.5.
Used to specify Matern kernel. The smaller nu,
the less smooth the approximated function is. By default 2.5.
alpha : float
Used to specify Gaussian Process Regressor. Larger values correspond to increased noise level in the observations.
Used to specify Gaussian Process Regressor.
Larger values correspond to increased noise level in the observations.
By default 1e-6.
cold_start_num : int
Number of random exploration to perform before Gaussian Process. By default 10.
Number of random exploration to perform before Gaussian Process.
By default 10.
selection_num_warm_up : int
Number of random points to evaluate for getting the point which maximizes the acquisition function. By default 100000
Number of random points to evaluate for getting the point which
maximizes the acquisition function. By default 100000.
selection_num_starting_points : int
Number of times to run L-BFGS-B from a random starting point after the warmup. By default 250.
Number of times to run L-BFGS-B from a random starting point after the warmup.
By default 250.
"""
def __init__(self, optimize_mode="maximize", utility='ei', kappa=5, xi=0, nu=2.5, alpha=1e-6, cold_start_num=10,
......
......@@ -2,14 +2,10 @@
# Licensed under the MIT license.
"""
Grid search tuner for hyper-parameter optimization.
Grid search tuner.
For categorical parameters this tuner fully explores all combinations.
For numerical parameters it samples them at progressively decreased intervals.
Use this tuner if you have abundant resource and want to find strictly optimal parameters.
Grid search tuner has no argument.
"""
__all__ = ['GridSearchTuner']
......@@ -63,6 +59,35 @@ _logger = logging.getLogger('nni.tuner.gridsearch')
##
class GridSearchTuner(Tuner):
"""
Grid search tuner divides the search space into an evenly spaced grid and performs a brute-force traversal.
It is recommended when the search space is small, or when you want to find strictly optimal hyperparameters.
**Implementation**
The original grid search approach performs an exhaustive search through a space consisting of ``choice`` and ``randint``.
NNI's implementation extends grid search to support all search space types.
When the search space contains continuous parameters like ``normal`` and ``loguniform``,
grid search tuner works in the following steps:
1. Divide the search space into a grid.
2. Perform an exhaustive search through the grid.
3. Subdivide the grid into a finer-grained new grid.
4. Go to step 2, until the experiment ends.
As a deterministic algorithm, grid search has no argument.
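The progressive subdivision in steps 1-3 can be sketched for a continuous ``uniform(0, 1)`` parameter as follows (a minimal illustration, not NNI's internal code):

.. code-block:: python

    def grid_points(low, high, depth):
        # depth 0 -> midpoint; each extra depth doubles the resolution
        n = 2 ** depth
        return [low + (high - low) * (2 * i + 1) / (2 * n) for i in range(n)]

    # depth 0: [0.5]; depth 1: [0.25, 0.75]; depth 2: [0.125, 0.375, 0.625, 0.875]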
Examples
--------
.. code-block::
config.tuner.name = 'GridSearch'
"""
def __init__(self):
self.space = None
......
......@@ -105,7 +105,8 @@ def json2parameter(ss_spec, random_state):
class Bracket():
"""A bracket in Hyperband, all the information of a bracket is managed by an instance of this class
"""
A bracket in Hyperband; all the information of a bracket is managed by an instance of this class.
Parameters
----------
......@@ -267,24 +268,136 @@ class HyperbandClassArgsValidator(ClassArgsValidator):
class Hyperband(MsgDispatcherBase):
"""
Hyperband inherit from MsgDispatcherBase rather than Tuner, because it integrates both tuner's functions and assessor's functions.
This is an implementation that could fully leverage available resources or follow the algorithm process,
i.e., high parallelism or serial.
A single execution of Hyperband takes a finite budget of (s_max + 1)B.
`Hyperband <https://arxiv.org/pdf/1603.06560.pdf>`__ is a multi-fidelity hyperparameter tuning algorithm
based on successive halving.
The basic idea of Hyperband is to create several buckets,
each having ``n`` randomly generated hyperparameter configurations,
each configuration using ``r`` resources (e.g., epoch number, batch number).
After the ``n`` configurations are finished, it chooses the top ``n/eta`` configurations
and runs them using increased ``r*eta`` resources.
At last, it chooses the best configuration it has found so far.
Please refer to the paper :footcite:t:`li2017hyperband` for detailed algorithm.
Examples
--------
.. code-block::
config.advisor.name = 'Hyperband'
config.advisor.class_args = {
'optimize_mode': 'maximize',
'R': 60,
'eta': 3
}
Note that once you use Advisor, you are not allowed to add a Tuner and Assessor spec in the config file.
When Hyperband is used, the dict returned by :func:`nni.get_next_parameter` contains one more key
called ``TRIAL_BUDGET`` besides the hyperparameters and their values.
**With this TRIAL_BUDGET, users can control in trial code how long a trial runs by following
the suggested trial budget from Hyperband.** ``TRIAL_BUDGET`` is a relative number;
users can interpret it as the number of epochs, number of mini-batches, running time, etc.
Here is a concrete example of ``R=81`` and ``eta=3``:
.. list-table::
:header-rows: 1
:widths: auto
* -
- s=4
- s=3
- s=2
- s=1
- s=0
* - i
- n r
- n r
- n r
- n r
- n r
* - 0
- 81 1
- 27 3
- 9 9
- 6 27
- 5 81
* - 1
- 27 3
- 9 9
- 3 27
- 2 81
-
* - 2
- 9 9
- 3 27
- 1 81
-
-
* - 3
- 3 27
- 1 81
-
-
-
* - 4
- 1 81
-
-
-
-
``s`` means bucket, ``n`` means the number of configurations that are generated,
and the corresponding ``r`` means how much budget these configurations run with.
``i`` means round; for example, bucket 4 has 5 rounds and bucket 3 has 4 rounds.
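A minimal sketch (illustrative only, not NNI's implementation) of one SuccessiveHalving bucket, e.g. the ``s=3`` column of the table above:

.. code-block:: python

    n, r, eta = 27, 3, 3  # the s=3 bucket above
    while n >= 1:
        print(f'run {n} configurations, each with budget {r}')
        n //= eta  # keep the top 1/eta configurations
        r *= eta   # survivors rerun with eta times more budget
    # prints the (n, r) pairs: (27, 3), (9, 9), (3, 27), (1, 81)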
A complete example can be found :githublink:`examples/trials/mnist-advisor`.
Parameters
----------
optimize_mode: str
Optimize mode, 'maximize' or 'minimize'.
R: int
the maximum amount of resource that can be allocated to a single configuration
The maximum amount of budget that can be allocated to a single configuration.
Here, trial budget could mean the number of epochs, number of mini-batches, etc.,
depending on how users interpret it.
Each trial should use ``TRIAL_BUDGET`` to control how long it runs.
eta: int
the variable that controls the proportion of configurations discarded in each round of SuccessiveHalving
optimize_mode: str
optimize mode, 'maximize' or 'minimize'
The variable that controls the proportion of configurations discarded in each round of SuccessiveHalving.
``1/eta`` configurations will survive and rerun using more budgets in each round.
exec_mode: str
execution mode, 'serial' or 'parallelism'
Execution mode, 'serial' or 'parallelism'.
If 'parallelism', the tuner will try to use available resources to start a new bucket immediately.
If 'serial', the tuner will only start a new bucket after the current bucket is done.
Notes
-----
First, Hyperband is an example of how to write an autoML algorithm based on MsgDispatcherBase,
rather than based on Tuner and Assessor. Hyperband is implemented in this way
because it integrates the functions of both Tuner and Assessor; thus, we call it an Advisor.
Second, this implementation fully leverages Hyperband's internal parallelism.
Specifically, the next bucket is not started strictly after the current bucket.
Instead, it starts when there are available resources. If you want to use the full parallelism mode,
set ``exec_mode`` to ``parallelism``.
Or, if you want to follow the original algorithm, set ``exec_mode`` to ``serial``.
In this mode, the next bucket will start strictly after the current bucket.
``parallelism`` mode may lead to multiple unfinished buckets;
in contrast, there is at most one unfinished bucket under ``serial`` mode.
The advantage of ``parallelism`` mode is to make full use of resources,
which may reduce the experiment duration several times over.
"""
def __init__(self, R=60, eta=3, optimize_mode='maximize', exec_mode='parallelism'):
def __init__(self, optimize_mode='maximize', R=60, eta=3, exec_mode='parallelism'):
"""B = (s_max + 1)R"""
super(Hyperband, self).__init__()
self.R = R
......
......@@ -191,23 +191,31 @@ class HyperoptClassArgsValidator(ClassArgsValidator):
class HyperoptTuner(Tuner):
"""
HyperoptTuner is a tuner which uses the hyperopt algorithm.
NNI wraps `hyperopt <https://github.com/hyperopt/hyperopt>`__ to provide anneal tuner.
This simple annealing algorithm begins by sampling from the prior
but tends over time to sample from points closer and closer to the best ones observed.
This algorithm is a simple variation of random search that leverages smoothness in the response surface.
The annealing rate is not adaptive.
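A rough sketch of the annealing idea for a single ``uniform(low, high)`` parameter (illustrative only, not hyperopt's implementation):

.. code-block:: python

    import random

    def anneal_sample(best_so_far, trial_count, low=0.0, high=1.0):
        # the sampling window shrinks around the best observed point over time
        width = (high - low) / (1 + 0.1 * trial_count)
        value = random.uniform(best_so_far - width, best_so_far + width)
        return min(high, max(low, value))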
Examples
--------
.. code-block::
config.tuner.name = 'Anneal'
config.tuner.class_args = {
'optimize_mode': 'minimize'
}
Parameters
----------
optimize_mode: 'minimize' or 'maximize'
Whether to minimize or maximize the trial result.
"""
def __init__(self, algorithm_name, optimize_mode='minimize',
parallel_optimize=False, constant_liar_type='min'):
"""
Parameters
----------
algorithm_name : str
algorithm_name includes "tpe", "random_search" and "anneal".
optimize_mode : str
parallel_optimize : bool
More details can be found in docs/en_US/Tuner/HyperoptTuner.md
constant_liar_type : str
constant_liar_type includes "min", "max" and "mean"
More details can be found in docs/en_US/Tuner/HyperoptTuner.md
"""
self.algorithm_name = algorithm_name
self.optimize_mode = OptimizeMode(optimize_mode)
self.json = None
......@@ -238,15 +246,6 @@ class HyperoptTuner(Tuner):
raise RuntimeError('Not support tuner algorithm in hyperopt.')
def update_search_space(self, search_space):
"""
Update search space definition in tuner by search_space in parameters.
Will be called when first setting up the experiment or when updating the search space in the WebUI.
Parameters
----------
search_space : dict
"""
validate_search_space(search_space)
self.json = search_space
......@@ -266,22 +265,11 @@ class HyperoptTuner(Tuner):
self.rval.catch_eval_exceptions = False
def generate_parameters(self, parameter_id, **kwargs):
"""
Returns a set of trial (hyper-)parameters, as a serializable object.
Parameters
----------
parameter_id : int
Returns
-------
params : dict
"""
total_params = self.get_suggestion(random_search=False)
total_params = self._get_suggestion(random_search=False)
# avoid generating same parameter with concurrent trials because hyperopt doesn't support parallel mode
if total_params in self.total_data.values():
# but it can cause duplicate parameter rarely
total_params = self.get_suggestion(random_search=True)
total_params = self._get_suggestion(random_search=True)
self.total_data[parameter_id] = total_params
if self.parallel:
......@@ -291,17 +279,6 @@ class HyperoptTuner(Tuner):
return params
def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
"""
Record an observation of the objective function
Parameters
----------
parameter_id : int
parameters : dict
value : dict/float
if value is dict, it should have "default" key.
value is final metrics of the trial.
"""
reward = extract_scalar_reward(value)
# restore the parameters containing '_index'
if parameter_id not in self.total_data:
......@@ -369,7 +346,7 @@ class HyperoptTuner(Tuner):
idxs[key] = [new_id]
vals[key] = [vals[key]]
self.miscs_update_idxs_vals(rval_miscs,
self._miscs_update_idxs_vals(rval_miscs,
idxs,
vals,
idxs_map={new_id: new_id},
......@@ -382,7 +359,7 @@ class HyperoptTuner(Tuner):
trials.insert_trial_docs([trial])
trials.refresh()
def miscs_update_idxs_vals(self,
def _miscs_update_idxs_vals(self,
miscs,
idxs,
vals,
......@@ -416,7 +393,7 @@ class HyperoptTuner(Tuner):
misc_by_id[tid]['idxs'][key] = [tid]
misc_by_id[tid]['vals'][key] = [val]
def get_suggestion(self, random_search=False):
def _get_suggestion(self, random_search=False):
"""
get suggestion from hyperopt
......@@ -469,14 +446,6 @@ class HyperoptTuner(Tuner):
return total_params
def import_data(self, data):
"""
Import additional data for tuning
Parameters
----------
data:
a list of dictionaries, each of which has at least two keys, 'parameter' and 'value'
"""
_completed_num = 0
for trial_info in data:
logger.info("Importing data, current processing progress %s / %s", _completed_num, len(data))
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations
import logging
from schema import Schema, Optional
from nni import ClassArgsValidator
from nni.assessor import Assessor, AssessResult
from nni.typehint import Literal
from nni.utils import extract_scalar_history
logger = logging.getLogger('medianstop_Assessor')
......@@ -18,18 +21,35 @@ class MedianstopClassArgsValidator(ClassArgsValidator):
}).validate(kwargs)
class MedianstopAssessor(Assessor):
"""MedianstopAssessor is The median stopping rule stops a pending trial X at step S
"""
The median stopping rule stops a pending trial X at step S
if the trial’s best objective value by step S is strictly worse than the median value
of the running averages of all completed trials' objectives reported up to step S.
Paper: `Google Vizier: A Service for Black-Box Optimization
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf>`__
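A rough sketch of the rule (illustrative only, not NNI's implementation):

.. code-block:: python

    import statistics

    def should_stop(trial_history, completed_running_averages, step):
        # completed_running_averages: per completed trial, the running averages
        # of its objective values up to `step`
        median = statistics.median(avgs[step] for avgs in completed_running_averages)
        best_so_far = max(trial_history[:step + 1])
        return best_so_far < median  # strictly worse than the median -> stop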
Examples
--------
.. code-block::
config.assessor.name = 'Medianstop'
config.assessor.class_args = {
'optimize_mode': 'maximize',
'start_step': 5
}
Parameters
----------
optimize_mode : str
optimize mode, 'maximize' or 'minimize'
start_step : int
only after receiving start_step number of reported intermediate results
optimize_mode
Whether to minimize or maximize the trial result.
start_step
A trial is determined to be stopped or not
only after receiving start_step number of reported intermediate results.
"""
def __init__(self, optimize_mode='maximize', start_step=0):
def __init__(self, optimize_mode: Literal['minimize', 'maximize'] = 'maximize', start_step: int = 0):
self._start_step = start_step
self._running_history = dict()
self._completed_avg_history = dict()
......@@ -56,15 +76,6 @@ class MedianstopAssessor(Assessor):
self._running_history[trial_job_id].extend(trial_history[len(self._running_history[trial_job_id]):])
def trial_end(self, trial_job_id, success):
"""trial_end
Parameters
----------
trial_job_id : int
trial job id
success : bool
True if the trial successfully finished, False otherwise
"""
if trial_job_id in self._running_history:
if success:
cnt = 0
......@@ -79,25 +90,6 @@ class MedianstopAssessor(Assessor):
logger.warning('trial_end: trial_job_id does not exist in running_history')
def assess_trial(self, trial_job_id, trial_history):
"""assess_trial
Parameters
----------
trial_job_id : int
trial job id
trial_history : list
The history performance matrix of each trial
Returns
-------
bool
AssessResult.Good or AssessResult.Bad
Raises
------
Exception
unrecognized exception in medianstop_assessor
"""
curr_step = len(trial_history)
if curr_step < self._start_step:
return AssessResult.Good
......
......@@ -46,39 +46,74 @@ class MetisClassArgsValidator(ClassArgsValidator):
class MetisTuner(Tuner):
"""
Metis Tuner
`Metis tuner <https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/>`__ offers
several benefits over other tuning algorithms.
While most tools only predict the optimal configuration, Metis gives you two outputs:
a prediction for the optimal configuration and a suggestion for the next trial.
No more guesswork!
More information about the algorithm can be found here:
https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
While most tools assume training datasets do not have noisy data,
Metis actually tells you if you need to resample a particular hyper-parameter.
Attributes
While most tools tend to be exploitation-heavy,
Metis' search strategy balances exploration, exploitation, and (optional) resampling.
Metis belongs to the class of sequential model-based optimization (SMBO) algorithms
and it is based on the Bayesian Optimization framework. To model the parameter-vs-performance space,
Metis uses both a Gaussian Process and GMM. Since each trial can impose a high time cost,
Metis heavily trades inference computations with naive trials.
At each iteration, Metis does two tasks (refer to :footcite:t:`li2018metis` for details):
1. It finds the global optimal point in the Gaussian Process space.
This point represents the optimal configuration.
2. It identifies the next hyper-parameter candidate.
This is achieved by inferring the potential information gain of
exploration, exploitation, and resampling.
Note that the only acceptable types in the :doc:`search space </hpo/search_space>` are
``quniform``, ``uniform``, ``randint``, and numerical ``choice``.
Examples
--------
.. code-block::
config.tuner.name = 'MetisTuner'
config.tuner.class_args = {
'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
optimize_mode is a string including two modes: "maximize" and "minimize".
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
How many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis needs some trial results for a cold start.
When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for the trials.
exploration_probability: float
The probability of Metis to select parameter from exploration instead of exploitation.
optimize_mode : str
optimize_mode is a string including two modes: "maximize" and "minimize".
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
How many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis needs some trial results for a cold start.
When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for the trials.
exploration_probability: float
The probability of Metis to select parameter from exploration instead of exploitation.
"""
def __init__(
......@@ -89,43 +124,6 @@ class MetisTuner(Tuner):
selection_num_starting_points=600,
cold_start_num=10,
exploration_probability=0.9):
"""
Parameters
----------
optimize_mode : str
optimize_mode is a string including two modes: "maximize" and "minimize".
no_resampling : bool
True or False.
Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free,
then you do not need re-sampling.
no_candidates : bool
True or False.
Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks,
Metis can skip this step.
selection_num_starting_points : int
How many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num : int
Metis needs some trial results for a cold start.
When the number of trial results is less than
cold_start_num, Metis will randomly sample hyper-parameters for the trials.
exploration_probability : float
The probability of Metis to select parameter from exploration instead of exploitation.
x_bounds : list
The constraints of the parameters.
x_types : list
The type of parameters.
"""
self.samples_x = []
self.samples_y = []
self.samples_y_aggregation = []
......@@ -141,7 +139,9 @@ class MetisTuner(Tuner):
self.minimize_constraints_fun = None
self.minimize_starting_points = None
self.supplement_data_num = 0
# The constraints of the parameters
self.x_bounds = []
# The type of parameters
self.x_types = []
......
......@@ -170,26 +170,91 @@ class PBTClassArgsValidator(ClassArgsValidator):
}).validate(kwargs)
class PBTTuner(Tuner):
"""
Population Based Training (PBT) comes from `Population Based Training of Neural Networks <https://arxiv.org/abs/1711.09846v1>`__.
It's a simple asynchronous optimization algorithm which effectively utilizes a fixed computational budget to jointly optimize
a population of models and their hyperparameters to maximize performance.
Importantly, PBT discovers a schedule of hyperparameter settings rather than following the generally sub-optimal strategy of
trying to find a single fixed set to use for the whole course of training.
.. image:: ../../img/pbt.jpg
PBTTuner initializes a population with several trials (i.e., ``population_size``).
There are four steps in the above figure, and each trial runs only one step at a time. The length of one step is controlled by the trial code,
e.g., one epoch. When a trial starts, it loads a checkpoint specified by PBTTuner and continues to run one step,
then saves the checkpoint to a directory specified by PBTTuner and exits.
The trials in a population run steps synchronously, that is, after all the trials finish the ``i``-th step,
the ``(i+1)``-th step can be started. Exploitation and exploration of PBT are executed between two consecutive steps.
There are two important steps to follow if you are trying to use PBTTuner:
1. **Provide checkpoint directory**. Since some trials need to load other trial's checkpoint,
users should provide a directory (i.e., ``all_checkpoint_dir``) which is accessible by every trial.
It is easy for local mode, users could directly use the default directory or specify any directory on the local machine.
For other training services, users should follow :doc:`the document of those training services <../experiment/training_service>`
to provide a directory in a shared storage, such as NFS, Azure storage.
2. **Modify your trial code**. Before running a step, a trial needs to load a checkpoint,
the checkpoint directory is specified in hyper-parameter configuration generated by PBTTuner,
i.e., ``params['load_checkpoint_dir']``. Similarly, the directory for saving checkpoint is also included in the configuration,
i.e., ``params['save_checkpoint_dir']``. Here, ``all_checkpoint_dir`` is base folder of ``load_checkpoint_dir``
and ``save_checkpoint_dir`` whose format is ``all_checkpoint_dir/<population-id>/<step>``.
.. code-block:: python
params = nni.get_next_parameter()
# the path of the checkpoint to load
load_path = os.path.join(params['load_checkpoint_dir'], 'model.pth')
# load checkpoint from `load_path`
...
# run one step
...
# the path for saving a checkpoint
save_path = os.path.join(params['save_checkpoint_dir'], 'model.pth')
# save checkpoint to `save_path`
...
The complete example code can be found :githublink:`here <examples/trials/mnist-pbt-tuner-pytorch>`.
Parameters
----------
optimize_mode : ``maximize`` or ``minimize``, default: ``maximize``
If ``maximize``, the tuner will target to maximize metrics. If ``minimize``, the tuner will target to minimize metrics.
all_checkpoint_dir : str
Directory for trials to load and save checkpoint.
If not specified, the directory would be ``~/nni/checkpoint/``.
Note that if the experiment is not local mode,
users should provide a path in a shared storage which can be accessed by all the trials.
population_size : int, default = 10
Number of trials in a population. Each step has this number of trials.
In our implementation, one step means running each trial for a specific number of training epochs set by users.
factor : float, default = 0.2
Factor for perturbation of hyperparameters.
resample_probability : float, default = 0.25
Probability for resampling.
fraction : float, default = 0.2
Fraction for selecting bottom and top trials.
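A rough sketch of the exploit-and-explore step between two consecutive steps (illustrative only, not NNI's implementation; ``sample_from_search_space`` is a hypothetical helper, and numeric hyperparameters are assumed):

.. code-block:: python

    import random

    def exploit_and_explore(top_params, factor=0.2, resample_probability=0.25):
        # a bottom trial copies a top trial's hyperparameters (and checkpoint),
        # then perturbs or resamples each hyperparameter
        params = dict(top_params)
        for name, value in params.items():
            if random.random() < resample_probability:
                params[name] = sample_from_search_space(name)  # hypothetical helper
            else:
                params[name] = value * random.choice((1 - factor, 1 + factor))
        return params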
Examples
--------
Below is an example of a PBTTuner configuration in the experiment config file.
.. code-block:: yaml
tuner:
name: PBTTuner
classArgs:
optimize_mode: maximize
all_checkpoint_dir: /the/path/to/store/checkpoints
population_size: 10
Notes
-----
Assessor is not allowed if PBTTuner is used.
"""
def __init__(self, optimize_mode="maximize", all_checkpoint_dir=None, population_size=10, factor=0.2,
resample_probability=0.25, fraction=0.2):
"""
Initialization
Parameters
----------
optimize_mode : str
maximize or minimize
all_checkpoint_dir : str
directory to store training model checkpoint
population_size : int
number of trials for each epoch
factor : float
factor for perturbation
resample_probability : float
probability for resampling
fraction : float
fraction for selecting bottom and top trials
"""
self.optimize_mode = OptimizeMode(optimize_mode)
if all_checkpoint_dir is None:
all_checkpoint_dir = os.getenv('NNI_CHECKPOINT_DIRECTORY')
......
......@@ -306,40 +306,37 @@ class PPOClassArgsValidator(ClassArgsValidator):
class PPOTuner(Tuner):
"""
PPOTuner is a tuner whose implementation inherits the main logic of
`ppo2 from openai <https://github.com/openai/baselines/tree/master/baselines/ppo2>`__ and is adapted for the NAS scenario.
It uses an ``lstm`` for its policy network and value network; the policy and value share the same network.
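Examples
--------
Below is a configuration sketch for this tuner, assuming the builtin tuner name ``PPOTuner``
(``optimize_mode`` is required; the other arguments shown are the defaults):

.. code-block::

    config.tuner.name = 'PPOTuner'
    config.tuner.class_args = {
        'optimize_mode': 'maximize',
        'trials_per_update': 20,
        'epochs_per_update': 4,
        'minibatch_size': 4
    }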
Parameters
----------
optimize_mode : str
'maximize' or 'minimize'
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
"""
def __init__(self, optimize_mode, trials_per_update=20, epochs_per_update=4, minibatch_size=4,
ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, cliprange=0.2):
"""
Initialization, PPO model is not initialized here as search space is not received yet.
Parameters
----------
optimize_mode : str
maximize or minimize
trials_per_update : int
Number of trials to have for each model update
epochs_per_update : int
Number of epochs to run for each model update
minibatch_size : int
Minibatch size (number of trials) for the update
ent_coef : float
Policy entropy coefficient in the optimization objective
lr : float
Learning rate of the model (lstm network), constant
vf_coef : float
Value function loss coefficient in the optimization objective
max_grad_norm : float
Gradient norm clipping coefficient
gamma : float
Discounting factor
lam : float
Advantage estimation discounting factor (lambda in the paper)
cliprange : float
Cliprange in the PPO algorithm, constant
"""
self.optimize_mode = OptimizeMode(optimize_mode)
self.model_config = ModelConfig()
self.model = None
......
......@@ -2,12 +2,14 @@
# Licensed under the MIT license.
"""
Naive random tuner.
You can specify an integer seed to make the results reproducible.
"""
from __future__ import annotations
__all__ = ['RandomTuner']
import logging
......@@ -21,7 +23,26 @@ from nni.tuner import Tuner
_logger = logging.getLogger('nni.tuner.random')
class RandomTuner(Tuner):
"""
A naive tuner that generates fully random hyperparameters.
Examples
--------
.. code-block::
config.tuner.name = 'Random'
config.tuner.class_args = {
    'seed': 100
}
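Conceptually, every suggestion samples each parameter independently from the search space,
ignoring trial history. Below is a minimal sketch of this idea under the NNI search space format
(``_type`` / ``_value``), covering only a few types and not the actual implementation:

.. code-block:: python

    import numpy as np

    def random_suggest(search_space, rng):
        # draw one value per parameter, independent of any trial history
        params = {}
        for name, spec in search_space.items():
            kind, values = spec['_type'], spec['_value']
            if kind == 'choice':
                params[name] = values[rng.integers(len(values))]
            elif kind == 'randint':
                params[name] = int(rng.integers(values[0], values[1]))
            elif kind == 'uniform':
                params[name] = rng.uniform(values[0], values[1])
            elif kind == 'loguniform':
                params[name] = float(np.exp(rng.uniform(np.log(values[0]), np.log(values[1]))))
        return params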
Parameters
----------
seed
The random seed.
"""
def __init__(self, seed: int | None = None):
self.space = None
if seed is None: # explicitly generate a seed to make the experiment reproducible
seed = np.random.default_rng().integers(2 ** 31)
......
......@@ -38,20 +38,46 @@ class SMACClassArgsValidator(ClassArgsValidator):
class SMACTuner(Tuner):
"""
`SMAC <https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf>`__ is based on Sequential Model-Based Optimization (SMBO).
It adapts the most prominent previously used model class (Gaussian stochastic process models)
and introduces the model class of random forests to SMBO in order to handle categorical parameters.
The SMAC supported by NNI is a wrapper of `the SMAC3 GitHub repo <https://github.com/automl/SMAC3>`__,
following the NNI tuner interface :class:`nni.tuner.Tuner`. It only supports the ``SMAC`` mode and does not support
the multiple instances of SMAC3 (i.e., running the same configuration multiple times).
For algorithm details of SMAC, please refer to the paper :footcite:t:`hutter2011sequential`.
Note that SMAC on NNI only supports a subset of the types in
:doc:`search space </hpo/search_space>`:
``choice``, ``randint``, ``uniform``, ``loguniform``, and ``quniform``.
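For example, the following search space (a sketch with hypothetical parameter names, in the NNI search space format)
uses only SMAC-supported types:

.. code-block:: python

    search_space = {
        'batch_size': {'_type': 'choice', '_value': [16, 32, 64]},
        'num_layers': {'_type': 'randint', '_value': [1, 5]},
        'learning_rate': {'_type': 'loguniform', '_value': [1e-5, 1e-1]},
        'dropout_rate': {'_type': 'uniform', '_value': [0.1, 0.5]}
    }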
Note that SMAC needs additional installation using the following command:
.. code-block:: bash
pip install nni[SMAC]
``swig`` is required by SMAC; on Ubuntu, it can be installed with ``apt install swig``.
Examples
--------
.. code-block::
config.tuner.name = 'SMAC'
config.tuner.class_args = {
    'optimize_mode': 'maximize'
}
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has been already generated.
If False, a configuration may be generated twice, but this is rare for a relatively large search space.
"""
def __init__(self, optimize_mode="maximize", config_dedup=False):
"""
Parameters
----------
optimize_mode : str
Optimize mode, 'maximize' or 'minimize', by default 'maximize'
config_dedup : bool
If True, the tuner will not generate a configuration that has been already generated.
If False, a configuration may be generated twice, but it is rare for relatively large search space.
"""
self.logger = logger
self.optimize_mode = OptimizeMode(optimize_mode)
self.total_data = {}
......
......@@ -2,26 +2,30 @@
# Licensed under the MIT license.
"""
Tree-structured Parzen Estimator (TPE) tuner.
Paper: https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf
Official code: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/tpe.py
This is a slightly modified re-implementation of the algorithm.
"""
from __future__ import annotations
__all__ = ['TpeTuner', 'TpeArguments']
from collections import defaultdict
import logging
import math
from typing import Any, NamedTuple
import numpy as np
from scipy.special import erf # pylint: disable=no-name-in-module
from nni.common.hpo_utils import OptimizeMode, format_search_space, deformat_parameters, format_parameters
from nni.tuner import Tuner
from nni.typehint import Literal
from nni.utils import extract_scalar_reward
from . import random_tuner
......@@ -31,12 +35,13 @@ _logger = logging.getLogger('nni.tuner.tpe')
class TpeArguments(NamedTuple):
"""
Hyperparameters of the TPE algorithm itself.
To avoid confusion with the trials' hyperparameters being tuned, these are called "arguments" here.
Parameters
----------
constant_liar_type
The TPE algorithm itself does not support parallel tuning.
This argument specifies how to optimize for ``trial_concurrency > 1``.
......@@ -44,20 +49,21 @@ class TpeArguments(NamedTuple):
How each liar works is explained in paper's section 6.1.
In general "best" suit for small trial number and "worst" suit for large trial number.
(:doc:`experiment result </misc/parallelizing_tpe_search>`)
n_startup_jobs
The first N hyperparameters are generated fully randomly for warming up.
If the search space is large, you can increase this value;
if ``max_trial_number`` is small, you may want to decrease it.
n_ei_candidates
For each iteration, TPE evaluates the EI of N candidate sets of parameters and chooses the best one (loosely speaking).
linear_forgetting
TPE will lower the weights of old trials.
This controls how many iterations it takes for a trial's weight to start decaying.
prior_weight
TPE treats the user-provided search space as a prior.
When generating new trials, it also incorporates the prior into the trial history by transforming the search space into
one trial configuration (i.e., each parameter of this configuration takes the mean of its candidate range).
......@@ -66,11 +72,11 @@ class TpeArguments(NamedTuple):
With prior weight 1.0, the search space is treated as one good trial.
For example, "normal(0, 1)" effectly equals to a trial with x = 0 which has yielded good result.
gamma
Controls how many trials are considered "good".
The number is calculated as ``min(gamma * sqrt(N), linear_forgetting)``.
"""
constant_liar_type: Literal['best', 'worst', 'mean'] | None = 'best'
n_startup_jobs: int = 20
n_ei_candidates: int = 24
linear_forgetting: int = 25
......@@ -79,18 +85,68 @@ class TpeArguments(NamedTuple):
class TpeTuner(Tuner):
"""
Tree-structured Parzen Estimator (TPE) tuner.
TPE is a lightweight tuner that has no extra dependencies and supports all search space types,
designed to be the default tuner.
It has the drawback that it cannot discover relationships between different hyperparameters.
**Implementation**
TPE is an SMBO algorithm.
It models P(x|y) and P(y) where x represents hyperparameters and y the evaluation result.
P(x|y) is modeled by transforming the generative process of hyperparameters,
replacing the distributions of the configuration prior with non-parametric densities.
Paper: `Algorithms for Hyper-Parameter Optimization
<https://proceedings.neurips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf>`__
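As a concrete illustration, the core of one TPE iteration for a single continuous parameter can be
sketched as follows. This is a minimal sketch assuming a minimization objective and a naive
fixed-bandwidth Parzen estimator; the real tuner additionally handles all search space types,
trial weights, and the prior:

.. code-block:: python

    import numpy as np

    def tpe_suggest(history_x, history_y, gamma=0.25, n_ei_candidates=24, rng=None):
        # assumes the history already contains more than a handful of observations
        rng = rng or np.random.default_rng()
        x, y = np.asarray(history_x, float), np.asarray(history_y, float)
        # split observations into "good" and "bad" by the gamma quantile of losses
        n_good = max(1, int(np.ceil(gamma * np.sqrt(len(y)))))
        order = np.argsort(y)  # lower loss is better
        good, bad = x[order[:n_good]], x[order[n_good:]]

        def parzen_pdf(points, query):
            # naive Parzen estimator: equal-weight Gaussians with one shared bandwidth
            sigma = points.std() + 1e-6
            return np.exp(-0.5 * ((query[:, None] - points[None, :]) / sigma) ** 2).mean(axis=1)

        # sample candidates from the "good" density l(x) and pick the argmax of l(x) / g(x),
        # which (loosely) corresponds to maximizing expected improvement
        candidates = rng.choice(good, n_ei_candidates) + rng.normal(0.0, good.std() + 1e-6, n_ei_candidates)
        scores = parzen_pdf(good, candidates) / (parzen_pdf(bad, candidates) + 1e-12)
        return candidates[np.argmax(scores)]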
Examples
--------
.. code-block::
## minimal config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
    'optimize_mode': 'maximize'
}
.. code-block::
## advanced config ##
config.tuner.name = 'TPE'
config.tuner.class_args = {
    'optimize_mode': 'maximize',
    'seed': 12345,
    'tpe_args': {
        'constant_liar_type': 'mean',
        'n_startup_jobs': 10,
        'n_ei_candidates': 20,
        'linear_forgetting': 100,
        'prior_weight': 0,
        'gamma': 0.5
    }
}
Parameters
----------
optimize_mode: Literal['minimize', 'maximize']
Whether to minimize or maximize the trial result.
seed
The random seed.
tpe_args
Advanced users can use this to customize TPE tuner.
See :class:`TpeArguments` for details.
"""
def __init__(self,
optimize_mode: Literal['minimize', 'maximize'] = 'minimize',
seed: int | None = None,
tpe_args: dict[str, Any] | None = None):
self.optimize_mode = OptimizeMode(optimize_mode)
self.args = TpeArguments(**(tpe_args or {}))
self.space = None
......@@ -183,7 +239,7 @@ def suggest_parameter(args, rng, spec, parameter_history):
## Utilities part ##
class Record(NamedTuple):
param: int | float
loss: float
class BestLiar:  # assume running trials will yield the best observed result; this accelerates convergence
......@@ -305,7 +361,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
This function is used for everything other than "choice" and "randint".
Parameters
----------
args: TpeArguments
Algorithm arguments.
history_mus: 1-d array of float
......@@ -317,7 +373,7 @@ def adaptive_parzen_normal(args, history_mus, prior_mu, prior_sigma):
σ value of normal search space.
Returns
-------
Tuple of three 1-d float arrays: (weight, µ, σ).
The tuple represents N+1 "vicinity of observations" and each one's weight,
......
......@@ -5,6 +5,7 @@ import copy
import functools
import inspect
import numbers
import os
import sys
import types
import warnings
......@@ -257,6 +258,13 @@ def trace(cls_or_func: T = None, *, kw_only: bool = True, inheritable: bool = Fa
pass
"""
# This is an internal flag to control the behavior of trace.
# Useful in doc builds and tests.
# Might be changed in the future.
nni_trace_flag = os.environ.get('NNI_TRACE_FLAG', '')
if nni_trace_flag.lower() == 'disable':
return cls_or_func
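# For example, setting ``NNI_TRACE_FLAG=disable`` in the environment (e.g. for a doc build)
# makes ``trace`` return ``cls_or_func`` unchanged, i.e. it behaves as a no-op decorator.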
def wrap(cls_or_func):
# already annotated, do nothing
if is_wrapped_with_trace(cls_or_func):
......