@@ -162,7 +168,7 @@ class AMCTaskGenerator(TaskGenerator):
class AMCPruner(IterativePruner):
r"""
__doc__ = r"""
AMC pruner leverages reinforcement learning to provide the model compression policy.
According to the authors, this learning-based compression policy outperforms conventional rule-based compression policies by achieving a higher compression ratio,
better preserving the accuracy, and saving human labor.
...
...
@@ -186,10 +192,11 @@ class AMCPruner(IterativePruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
dummy_input : torch.Tensor
``dummy_input`` is required for speedup and for tracing the model in the RL environment.
evaluator : Callable[[Module], float]
Evaluate the pruned model and give a score.
evaluator
``evaluator`` is used to replace the previous ``finetuner``, ``dummy_input`` and old ``evaluator`` API.
{evaluator_docstring}
The old API (``finetuner``, ``dummy_input`` and old ``evaluator``) is still supported and will be deprecated in v3.0.
If you want to consult the old API, please refer to `v2.8 pruner API <https://nni.readthedocs.io/en/v2.8/reference/compression/pruner.html>`__.
_logger.warning('Only `total_sparsity` can be differentially allocated sparse ratio to each layer, `sparsity` or `sparsity_per_layer` will allocate fixed sparse ratio to layers. Make sure you know what this will lead to, otherwise please use `total_sparsity`.')
warn_msg = 'Only `total_sparsity` can be differentially allocated sparse ratio to each layer, ' + \
'`sparsity` or `sparsity_per_layer` will allocate fixed sparse ratio to layers. ' + \
'Make sure you know what this will lead to, otherwise please use `total_sparsity`.'
@@ -53,8 +60,9 @@ class AutoCompressTaskGenerator(LotteryTicketTaskGenerator):
class AutoCompressPruner(IterativePruner):
r"""
__doc__ = r"""
For a total iteration number :math:`N`, AutoCompressPruner prunes the model that survived the previous iteration with a fixed sparsity ratio (e.g., :math:`1-{(1-0.8)}^{(1/N)}`) to achieve the overall sparsity (e.g., :math:`0.8`):
"""+r"""
.. code-block:: bash
...
...
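As a rough sanity check of the per-iteration ratio above (an illustrative sketch, not part of the pruner's code), pruning the weights that survived so far by :math:`1-(1-s)^{1/N}` in each of :math:`N` iterations compounds to the overall sparsity :math:`s`:

.. code-block:: python

    # hypothetical arithmetic check of the AutoCompress per-iteration sparsity
    overall_sparsity = 0.8      # s
    total_iteration = 4         # N
    per_iteration = 1 - (1 - overall_sparsity) ** (1 / total_iteration)

    remaining = 1.0
    for _ in range(total_iteration):
        remaining *= 1 - per_iteration   # prune only the weights that survived so far
    print(1 - remaining)                 # ~0.8, the overall target sparsity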
@@ -65,35 +73,27 @@ class AutoCompressPruner(IterativePruner):
assert self.bound_model is not None and self.config_list is not None, 'Model and/or config_list are not set in this pruner, please set them by reset() before compress().'
err_msg = 'Model and/or config_list are not set in this pruner, please set them by reset() before compress().'
This is a basic pruner; in some papers it is called magnitude pruning or fine-grained pruning.
...
...
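The fine-grained magnitude criterion described above can be sketched in a few lines of plain PyTorch (an illustrative sketch under simplified assumptions, not the pruner's actual implementation): build a binary mask that zeroes the weights with the smallest absolute values until the requested sparsity is reached.

.. code-block:: python

    import torch

    def level_mask(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
        """Illustrative fine-grained (magnitude) mask: zero the smallest |w| entries."""
        num_prune = int(weight.numel() * sparsity)
        if num_prune == 0:
            return torch.ones_like(weight)
        # threshold = the num_prune-th smallest absolute value
        threshold = weight.abs().flatten().kthvalue(num_prune).values
        return (weight.abs() > threshold).float()

    w = torch.randn(64, 128)
    mask = level_mask(w, sparsity=0.5)   # roughly half of the entries become zero
    pruned_w = w * mask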
@@ -133,9 +213,9 @@ class LevelPruner(BasicPruner):
Parameters
----------
model : torch.nn.Module
model
Model to be pruned.
config_list : List[Dict]
config_list
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equal to sparsity.
...
...
@@ -143,7 +223,7 @@ class LevelPruner(BasicPruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
mode : str
mode
'normal' or 'balance'.
If 'normal' mode is set, the target tensor will be pruned in a fine-grained way.
If 'balance' mode is set, a special sparse pattern will be chosen by the pruner. Take linear
...
...
@@ -152,7 +232,7 @@ class LevelPruner(BasicPruner):
pattern has a better chance of achieving a good trade-off between model performance and hardware
acceleration. Please refer to the related paper for further information: `Balanced Sparsity for
Efficient DNN Inference on GPU <https://arxiv.org/pdf/1811.00206.pdf>`__.
balance_gran : list
balance_gran
``balance_gran`` is for the special sparse pattern, balanced sparsity. The default value is None, which means pruning
without balance awareness, namely normal fine-grained pruning.
If a list of int is passed, LevelPruner will prune the model at the granularity of a multi-dimensional block.
...
...
@@ -195,7 +275,8 @@ class LevelPruner(BasicPruner):
>>> pruner = LevelPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/level_pruning_torch.py <examples/model_compress/pruning/level_pruning_torch.py>`
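For the 'balance' mode above, a minimal sketch of the idea (illustrative only, assuming a 2-D weight and a 1-D block granularity) is to split each row into equal-sized blocks and keep the same number of largest-magnitude entries in every block, so the zeros are evenly distributed and easier to accelerate on hardware:

.. code-block:: python

    import torch

    def balanced_mask(weight: torch.Tensor, sparsity: float, block_size: int) -> torch.Tensor:
        """Illustrative balanced-sparsity mask: prune the same fraction inside every block."""
        out_features, in_features = weight.shape
        assert in_features % block_size == 0
        blocks = weight.abs().reshape(out_features, in_features // block_size, block_size)
        keep = block_size - int(block_size * sparsity)      # entries kept per block
        idx = blocks.topk(keep, dim=-1).indices             # largest |w| inside each block
        mask = torch.zeros_like(blocks).scatter_(-1, idx, 1.0)
        return mask.reshape(out_features, in_features)

    w = torch.randn(16, 64)
    mask = balanced_mask(w, sparsity=0.5, block_size=4)   # every block of 4 keeps exactly 2 entries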
@@ -298,9 +380,9 @@ class L1NormPruner(NormPruner):
Parameters
----------
model : torch.nn.Module
model
Model to be pruned.
config_list : List[Dict]
config_list
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equal to sparsity.
...
...
@@ -308,7 +390,7 @@ class L1NormPruner(NormPruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
mode : str
mode
'normal' or 'dependency_aware'.
If pruning the model in a dependency-aware way, this pruner will
prune the model according to the l1-norm of weights and the channel-dependency or
...
...
@@ -317,7 +399,7 @@ class L1NormPruner(NormPruner):
harvest the speed benefit from the pruned model. Note that if 'dependency_aware' is set,
the dummy_input cannot be None, because the pruner needs a dummy input to trace the
dependency between the conv layers.
dummy_input : Optional[torch.Tensor]
dummy_input
The dummy input to analyze the topology constraints. Note that the dummy_input
should be on the same device as the model.
"""
...
...
@@ -330,15 +412,16 @@ class L1NormPruner(NormPruner):
class L2NormPruner(NormPruner):
r"""
L2 norm pruner is a variant of L1 norm pruner.
The only difference between L2 norm pruner and L1 norm pruner is that L2 norm pruner prunes the weight with the smallest L2 norm of the weights.
The only difference between L2 norm pruner and L1 norm pruner is that
L2 norm pruner prunes the weight with the smallest L2 norm of the weights.
L2 norm pruner also supports dependency-aware mode.
Parameters
----------
model : torch.nn.Module
model
Model to be pruned.
config_list : List[Dict]
config_list
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equal to sparsity.
...
...
@@ -346,7 +429,7 @@ class L2NormPruner(NormPruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
mode : str
mode
'normal' or 'dependency_aware'.
If pruning the model in a dependency-aware way, this pruner will
prune the model according to the l2-norm of weights and the channel-dependency or
...
...
@@ -355,7 +438,7 @@ class L2NormPruner(NormPruner):
harvest the speed benefit from the pruned model. Note that if 'dependency_aware' is set,
the dummy_input cannot be None, because the pruner needs a dummy input to trace the
dependency between the conv layers.
dummy_input : Optional[torch.Tensor]
dummy_input
The dummy input to analyze the topology constraints. Note that the dummy_input
should be on the same device as the model.
...
...
@@ -367,7 +450,8 @@ class L2NormPruner(NormPruner):
>>> pruner = L2NormPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/norm_pruning_torch.py <examples/model_compress/pruning/norm_pruning_torch.py>`
@@ -380,15 +464,16 @@ class FPGMPruner(BasicPruner):
FPGM pruner prunes the blocks of the weight on the first dimension with the smallest geometric median.
FPGM chooses the weight blocks with the most replaceable contribution.
For more details, please refer to `Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration <https://arxiv.org/abs/1811.00250>`__.
For more details, please refer to
`Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration <https://arxiv.org/abs/1811.00250>`__.
FPGM pruner also supports dependency-aware mode.
Parameters
----------
model : torch.nn.Module
model
Model to be pruned.
config_list : List[Dict]
config_list
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equal to sparsity.
...
...
@@ -396,7 +481,7 @@ class FPGMPruner(BasicPruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
mode : str
mode
'normal' or 'dependency_aware'.
If pruning the model in a dependency-aware way, this pruner will
prune the model according to the FPGM of weights and the channel-dependency or
...
...
@@ -405,7 +490,7 @@ class FPGMPruner(BasicPruner):
harvest the speed benefit from the pruned model. Note that if 'dependency_aware' is set,
the dummy_input cannot be None, because the pruner needs a dummy input to trace the
dependency between the conv layers.
dummy_input : Optional[torch.Tensor]
dummy_input
The dummy input to analyze the topology constraints. Note that the dummy_input
should be on the same device as the model.
...
...
@@ -417,7 +502,8 @@ class FPGMPruner(BasicPruner):
>>> pruner = FPGMPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/fpgm_pruning_torch.py <examples/model_compress/pruning/fpgm_pruning_torch.py>`
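A rough sketch of the geometric-median criterion (illustrative only, not the pruner's implementation): a filter whose summed distance to all other filters is smallest lies near the geometric median, is the most replaceable, and is therefore masked first.

.. code-block:: python

    import torch

    def fpgm_mask(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
        """Illustrative FPGM mask: prune the filters closest to all other filters."""
        num_filters = weight.shape[0]
        flat = weight.reshape(num_filters, -1)
        distance = torch.cdist(flat, flat, p=2).sum(dim=1)   # summed distance to every filter
        num_prune = int(num_filters * sparsity)
        prune_idx = distance.argsort()[:num_prune]            # the most replaceable filters
        mask = torch.ones_like(weight)
        mask[prune_idx] = 0.0
        return mask

    conv_weight = torch.randn(32, 16, 3, 3)
    mask = fpgm_mask(conv_weight, sparsity=0.5)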
For detailed example please refer to :githublink:`examples/model_compress/pruning/slim_pruning_torch.py <examples/model_compress/pruning/slim_pruning_torch.py>`
@@ -549,26 +613,48 @@ class SlimPruner(BasicPruner):
schema.validate(config_list)
except SchemaError as e:
if "Missing key: 'total_sparsity'" in str(e):
_logger.error('`config_list` validation failed. If global mode is set in this pruner, `sparsity_per_layer` and `sparsity` are not supported, make sure `total_sparsity` is set in config_list.')
err_msg = '`config_list` validation failed. If global mode is set in this pruner, ' + \
'`sparsity_per_layer` and `sparsity` are not supported, make sure `total_sparsity` is set in config_list.'
Activation APoZ rank pruner is a pruner which prunes on the first weight dimension,
__doc__=r"""Activation APoZ rank pruner is a pruner which prunes on the first weight dimension,
with the smallest importance criterion ``APoZ`` calculated from the output activations of convolution layers to achieve a preset level of network sparsity.
The pruning criterion ``APoZ`` is explained in the paper `Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures <https://arxiv.org/abs/1607.03250>`__.
For detailed example please refer to :githublink:`examples/model_compress/pruning/activation_pruning_torch.py <examples/model_compress/pruning/activation_pruning_torch.py>`
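A hedged sketch of how the ``APoZ`` score could be computed (illustrative only; the real pruner collects activations through hooks during forward passes): the APoZ of a channel is the average fraction of zeros in its post-ReLU output, and channels with the highest APoZ are pruned. The activation mean rank pruner below works the same way but ranks channels by mean activation instead.

.. code-block:: python

    import torch

    def apoz_per_channel(activation: torch.Tensor) -> torch.Tensor:
        """Average Percentage of Zeros per channel for a post-ReLU activation [N, C, H, W]."""
        zeros = (activation == 0).float()
        return zeros.mean(dim=(0, 2, 3))   # higher APoZ => less useful channel

    act = torch.relu(torch.randn(8, 32, 14, 14))           # stand-in activation batch
    apoz = apoz_per_channel(act)
    prune_channels = apoz.argsort(descending=True)[:16]    # channels with the most zeros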
Activation mean rank pruner is a pruner which prunes on the first weight dimension,
with the smallest importance criterion ``mean activation`` calculated from the output activations of convolution layers to achieve a preset level of network sparsity.
...
...
@@ -797,9 +865,9 @@ class ActivationMeanRankPruner(ActivationPruner):
Parameters
----------
model : torch.nn.Module
model
Model to be pruned.
config_list : List[Dict]
config_list
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equal to sparsity.
...
...
@@ -807,33 +875,15 @@ class ActivationMeanRankPruner(ActivationPruner):
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True, then the layers set by op_types and op_names will be excluded from pruning.
For detailed example please refer to :githublink:`examples/model_compress/pruning/activation_pruning_torch.py <examples/model_compress/pruning/activation_pruning_torch.py>`
Taylor FO weight pruner is a pruner which prunes on the first weight dimension,
based on estimated importance calculated from the first-order Taylor expansion on the weights to achieve a preset level of network sparsity.
The estimated importance is defined in the paper `Importance Estimation for Neural Network Pruning <http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf>`__.
For detailed example please refer to :githublink:`examples/model_compress/pruning/taylorfo_pruning_torch.py <examples/model_compress/pruning/taylorfo_pruning_torch.py>`
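A minimal sketch of the criterion above (one common form of the first-order Taylor importance, illustrative rather than the pruner's exact code): the importance of an output filter is estimated from the product of its weights and their gradients, accumulated after a backward pass.

.. code-block:: python

    import torch

    def taylor_fo_importance(weight: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
        """Illustrative first-order Taylor importance per output filter."""
        num_filters = weight.shape[0]
        return (weight * grad).pow(2).reshape(num_filters, -1).sum(dim=1)

    # after loss.backward(), conv.weight.grad holds dL/dw
    w = torch.randn(32, 16, 3, 3)
    g = torch.randn_like(w)                              # stand-in for conv.weight.grad
    importance = taylor_fo_importance(w, g)
    prune_idx = importance.argsort()[: int(32 * 0.5)]    # the least important filters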
@@ -983,16 +1004,19 @@ class TaylorFOWeightPruner(BasicPruner):
schema.validate(config_list)
except SchemaError as e:
if "Missing key: 'total_sparsity'" in str(e):
_logger.error('`config_list` validation failed. If global mode is set in this pruner, `sparsity_per_layer` and `sparsity` are not supported, make sure `total_sparsity` is set in config_list.')
err_msg = '`config_list` validation failed. If global mode is set in this pruner, ' + \
'`sparsity_per_layer` and `sparsity` are not supported, make sure `total_sparsity` is set in config_list.'
For detailed example please refer to :githublink:`examples/model_compress/pruning/admm_pruning_torch.py <examples/model_compress/pruning/admm_pruning_torch.py>`
@@ -71,55 +77,67 @@ class IterativePruner(PruningScheduler):
class LinearPruner(IterativePruner):
r"""
__doc__ = r"""
Linear pruner is an iterative pruner; it increases sparsity evenly from scratch during each iteration.
For example, if the final sparsity is set to 0.5 and the iteration number is 5, the sparsities used in each iteration are ``[0, 0.1, 0.2, 0.3, 0.4, 0.5]``.
For detailed example please refer to :githublink:`examples/model_compress/pruning/iterative_pruning_torch.py <examples/model_compress/pruning/iterative_pruning_torch.py>`
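The schedule above is simple arithmetic (a one-line sketch, not the pruner's API):

.. code-block:: python

    final_sparsity, total_iteration = 0.5, 5
    schedule = [final_sparsity * i / total_iteration for i in range(total_iteration + 1)]
    # [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]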
This is an iterative pruner, in which the sparsity is increased from an initial sparsity value :math:`s_{i}` (usually 0) to a final sparsity value :math:`s_{f}` over a span of :math:`n` pruning iterations,
starting at training step :math:`t_{0}` and with pruning frequency :math:`\Delta t`:
:math:`s_{t}=s_{f}+\left(s_{i}-s_{f}\right)\left(1-\frac{t-t_{0}}{n \Delta t}\right)^{3} \text { for } t \in\left\{t_{0}, t_{0}+\Delta t, \ldots, t_{0} + n \Delta t\right\}`
"""+r"""
For more details please refer to `To prune, or not to prune: exploring the efficacy of pruning for model compression <https://arxiv.org/abs/1710.01878>`__\.
For detailed example please refer to :githublink:`examples/model_compress/pruning/iterative_pruning_torch.py <examples/model_compress/pruning/iterative_pruning_torch.py>`
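The polynomial schedule above translates directly into code (an illustrative sketch of the formula only):

.. code-block:: python

    def agp_sparsity(t, s_i=0.0, s_f=0.5, t_0=0, n=10, delta_t=100):
        """Illustrative AGP schedule: cubic interpolation from s_i to s_f."""
        assert t_0 <= t <= t_0 + n * delta_t
        return s_f + (s_i - s_f) * (1 - (t - t_0) / (n * delta_t)) ** 3

    # sparsity grows quickly at first, then flattens out as it approaches s_f
    print([round(agp_sparsity(step), 3) for step in range(0, 1001, 250)])
    # [0.0, 0.289, 0.438, 0.492, 0.5]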
For detailed example please refer to :githublink:`examples/model_compress/pruning/iterative_pruning_torch.py <examples/model_compress/pruning/iterative_pruning_torch.py>`
We implement a guided heuristic search method, the Simulated Annealing (SA) algorithm. As mentioned in the paper, this method enhances guided search based on prior experience.
The enhanced SA technique is based on the observation that a DNN layer with a larger number of weights often tolerates a higher degree of model compression with less impact on overall accuracy.
...
...
@@ -294,54 +351,81 @@ class SimulatedAnnealingPruner(IterativePruner):
Parameters
----------
model : Optional[Module]
model
The origin unwrapped pytorch model to be pruned.
config_list : Optional[List[Dict]]
config_list
The origin config list provided by the user.
evaluator : Callable[[Module], float]
Evaluate the pruned model and give a score.
start_temperature : float
evaluator
``evaluator`` is used to replace the previous ``finetuner``, ``dummy_input`` and old ``evaluator`` API.
{evaluator_docstring}
The old API (``finetuner``, ``dummy_input`` and old ``evaluator``) is still supported and will be deprecated in v3.0.
If you want to consult the old API, please refer to `v2.8 pruner API <https://nni.readthedocs.io/en/v2.8/reference/compression/pruner.html>`__.
start_temperature
Start temperature of the simulated annealing process.
stop_temperature : float
stop_temperature
Stop temperature of the simulated annealing process.
cool_down_rate : float
cool_down_rate
Cool down rate of the temperature.
perturbation_magnitude : float
perturbation_magnitude
Initial perturbation magnitude to the sparsities. The magnitude decreases with current temperature.
For detailed example please refer to :githublink:`examples/model_compress/pruning/simulated_anealing_pruning_torch.py <examples/model_compress/pruning/simulated_anealing_pruning_torch.py>`
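A highly simplified sketch of the annealing loop these parameters control (illustrative only; the real pruner perturbs a per-layer sparsity vector under a total-sparsity constraint and scores each candidate with the evaluator):

.. code-block:: python

    import math
    import random

    def simulated_annealing(evaluate, init_sparsities, start_temperature=100,
                            stop_temperature=20, cool_down_rate=0.9,
                            perturbation_magnitude=0.35):
        """Illustrative SA loop: accept worse candidates with temperature-dependent probability."""
        current, current_score = list(init_sparsities), evaluate(init_sparsities)
        best, best_score = current, current_score
        temperature = start_temperature
        while temperature > stop_temperature:
            # the perturbation shrinks as the temperature cools down
            magnitude = perturbation_magnitude * temperature / start_temperature
            candidate = [min(1.0, max(0.0, s + random.uniform(-magnitude, magnitude)))
                         for s in current]
            score = evaluate(candidate)
            if score > current_score or random.random() < math.exp((score - current_score) / temperature):
                current, current_score = candidate, score
                if score > best_score:
                    best, best_score = candidate, score
            temperature *= cool_down_rate
        return best, best_score

    # toy usage: the "model score" is higher when sparsities stay close to 0.3
    best, score = simulated_annealing(lambda s: -sum(abs(x - 0.3) for x in s), [0.5, 0.5, 0.5])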
For detailed example please refer to :githublink:`examples/model_compress/pruning/movement_pruning_glue.py <examples/model_compress/pruning/movement_pruning_glue.py>`
_logger.debug('\nTask %s total real sparsity compared with original model is:\n%s', str(task_result.task_id), json_tricks.dumps(current2origin_sparsity, indent=4))
_logger.warning('Only `total_sparsity` can be differentially allocated sparse ratio to each layer, `sparsity` or `sparsity_per_layer` will allocate fixed sparse ratio to layers. Make sure you know what this will lead to, otherwise please use `total_sparsity`.')
warn_msg = 'Only `total_sparsity` can be differentially allocated sparse ratio to each layer, ' + \
'`sparsity` or `sparsity_per_layer` will allocate fixed sparse ratio to layers. ' + \
'Make sure you know what this will lead to, otherwise please use `total_sparsity`.'