"docs/archive_en_US/Tutorial/SearchSpaceSpec.md" did not exist on "171ae9181cb7061378080343cb333957f4ce7f29"
Unverified Commit b8d19e45 authored by colorjam, committed by GitHub

fix activation collection and add gradient pruners (#2187)

parent 4e2c0aad
......@@ -13,6 +13,8 @@ Index of supported pruning algorithms
* [Filter Pruners with Activation Rank](#activationrankfilterpruner)
* [APoZ Rank Pruner](#activationapozrankfilterpruner)
* [Activation Mean Rank Pruner](#activationmeanrankfilterpruner)
* [Filter Pruners with Gradient Rank](#gradientrankfilterpruner)
* [Taylor FO On Weight Pruner](#taylorfoweightfilterpruner)
## Level Pruner
......@@ -281,7 +283,7 @@ pruner.compress()
- **op_types:** Only Conv1d and Conv2d are supported in L2Filter Pruner
## ActivationRankFilterPruner
ActivationRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the output activations of convolution layers to achieve a preset level of network sparsity
ActivationRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the output activations of convolution layers to achieve a preset level of network sparsity.
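As a rough illustration of the activation-based criterion (a minimal sketch, not the pruners' actual implementation), the APoZ score used by ActivationAPoZRankFilterPruner can be computed per filter from a batch of post-activation outputs; filters with the largest average percentage of zeros are the pruning candidates:

```python
import torch

def apoz_per_filter(activation):
    # activation: post-ReLU output of a conv layer, shape (N, C, H, W).
    # APoZ of a filter is the fraction of zero entries in its activation maps,
    # averaged over the batch; filters with the largest APoZ are pruned.
    zeros = (activation == 0).float()
    return zeros.mean(dim=(0, 2, 3))  # one score per filter, shape (C,)
```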
### ActivationAPoZRankFilterPruner
......@@ -341,4 +343,42 @@ You can view example for more information
#### User configuration for ActivationMeanRankFilterPruner
- **sparsity:** The percentage of convolutional filters to be pruned.
- **op_types:** Only Conv2d is supported in ActivationMeanRankFilterPruner
- **op_types:** Only Conv2d is supported in ActivationMeanRankFilterPruner.
## GradientRankFilterPruner
GradientRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the gradients of convolution layers to achieve a preset level of network sparsity.
### TaylorFOWeightFilterPruner
We implemented it as a one-shot pruner; it prunes convolutional layers based on the first-order Taylor expansion on the weights. The estimated importance of filters is defined in the paper [Importance Estimation for Neural Network Pruning](http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf). Other pruning criteria mentioned in this paper will be supported in a future release.
![](../../img/importance_estimation_sum.png)
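For the weight-based criterion added in this commit, the per-filter importance in the figure above reduces to the squared product of each weight and its gradient, summed over the filter's elements and accumulated over training batches. A minimal sketch of that computation (mirroring the `calc_contributions` code in this commit, not a drop-in replacement for it):

```python
import torch

def taylor_fo_importance(weight, grad):
    # weight, grad: tensors of a conv layer, shape (filters, in_channels, k, k).
    # Importance of each filter = sum over its elements of (weight * gradient)^2.
    filters = weight.size(0)
    return (weight * grad).pow(2).view(filters, -1).sum(dim=1)
```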
#### Usage
PyTorch code
```python
from nni.compression.torch import TaylorFOWeightFilterPruner
config_list = [{
'sparsity': 0.5,
'op_types': ['Conv2d']
}]
pruner = TaylorFOWeightFilterPruner(model, config_list, optimizer)
pruner.compress()
```
You can view the example for more information.
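Note that the pruner patches `optimizer.step()` to accumulate the per-filter contributions, so at least `statistics_batch_num` training batches (forward pass, backward pass and `optimizer.step()`) are needed before `calc_mask` produces a non-trivial mask. A hedged sketch of such a loop, where `train_loader` and `criterion` are placeholders rather than part of the NNI API:

```python
for data, target in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    loss.backward()
    optimizer.step()  # patched by the pruner to accumulate (weight * grad)^2 per filter
```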
#### User configuration for TaylorFOWeightFilterPruner
- **sparsity:** The percentage of convolutional filters to be pruned.
- **op_types:** Currently only Conv2d is supported in TaylorFOWeightFilterPruner.
 
......@@ -7,3 +7,4 @@ from .weight_rank_filter_pruners import *
from .activation_rank_filter_pruners import *
from .quantizers import *
from .apply_compression import apply_compression_results
from .gradient_rank_filter_pruners import *
......@@ -37,13 +37,9 @@ class ActivationRankFilterPruner(Pruner):
super().__init__(model, config_list, optimizer)
self.set_wrappers_attribute("if_calculated", False)
self.set_wrappers_attribute("collected_activation", [])
self.statistics_batch_num = statistics_batch_num
self.hook_id = self._add_activation_collector()
def collector(module_, input_, output):
if len(module_.collected_activation) < self.statistics_batch_num:
module_.collected_activation.append(self.activation(output.detach().cpu()))
self.add_activation_collector(collector)
assert activation in ['relu', 'relu6']
if activation == 'relu':
self.activation = torch.nn.functional.relu
......@@ -52,6 +48,21 @@ class ActivationRankFilterPruner(Pruner):
else:
self.activation = None
def _add_activation_collector(self):
def collector(collected_activation):
def hook(module_, input_, output):
collected_activation.append(self.activation(output.detach().cpu()))
return hook
self.collected_activation = {}
self._fwd_hook_id += 1
self._fwd_hook_handles[self._fwd_hook_id] = []
for wrapper_idx, wrapper in enumerate(self.get_modules_wrapper()):
self.collected_activation[wrapper_idx] = []
handle = wrapper.register_forward_hook(collector(self.collected_activation[wrapper_idx]))
self._fwd_hook_handles[self._fwd_hook_id].append(handle)
return self._fwd_hook_id
def validate_config(self, model, config_list):
"""
Parameters
......@@ -73,24 +84,21 @@ class ActivationRankFilterPruner(Pruner):
def get_mask(self, base_mask, activations, num_prune):
raise NotImplementedError('{} get_mask is not implemented'.format(self.__class__.__name__))
def calc_mask(self, wrapper, **kwargs):
def calc_mask(self, wrapper, wrapper_idx, **kwargs):
"""
Calculate the mask of given layer.
Filters with the smallest importance criterion which is calculated from the activation are masked.
Parameters
----------
layer : LayerInfo
wrapper : Module
the layer to instrument the compression operation
config : dict
layer's pruning config
Returns
-------
dict
dictionary for storing masks
"""
weight = wrapper.module.weight.data
op_type = wrapper.type
config = wrapper.config
......@@ -100,21 +108,27 @@ class ActivationRankFilterPruner(Pruner):
if wrapper.if_calculated:
return None
mask_weight = torch.ones(weight.size()).type_as(weight).detach()
if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None:
mask_bias = torch.ones(wrapper.module.bias.size()).type_as(wrapper.module.bias).detach()
else:
mask_bias = None
mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias}
try:
filters = weight.size(0)
num_prune = int(filters * config.get('sparsity'))
if filters < 2 or num_prune < 1 or len(wrapper.collected_activation) < self.statistics_batch_num:
acts = self.collected_activation[wrapper_idx]
if filters < 2 or num_prune < 1 or len(acts) < self.statistics_batch_num:
return mask
mask = self.get_mask(mask, wrapper.collected_activation, num_prune)
mask = self.get_mask(mask, acts, num_prune)
finally:
if len(wrapper.collected_activation) == self.statistics_batch_num:
if len(acts) >= self.statistics_batch_num:
wrapper.if_calculated = True
if self.hook_id in self._fwd_hook_handles:
self.remove_activation_collector(self.hook_id)
return mask
......@@ -148,7 +162,7 @@ class ActivationAPoZRankFilterPruner(ActivationRankFilterPruner):
def get_mask(self, base_mask, activations, num_prune):
"""
Calculate the mask of given layer.
Filters with the smallest APoZ(average percentage of zeros) of output activations are masked.
Filters with the largest APoZ(average percentage of zeros) of output activations are masked.
Parameters
----------
......
......@@ -314,8 +314,8 @@ class Pruner(Compressor):
return self.bound_model
def update_mask(self):
for wrapper in self.get_modules_wrapper():
masks = self.calc_mask(wrapper)
for wrapper_idx, wrapper in enumerate(self.get_modules_wrapper()):
masks = self.calc_mask(wrapper, wrapper_idx=wrapper_idx)
if masks is not None:
for k in masks:
assert hasattr(wrapper, k), "there is no attribute '%s' in wrapper" % k
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import torch
from .compressor import Pruner
__all__ = ['TaylorFOWeightFilterPruner']
logger = logging.getLogger('torch gradient rank filter pruners')
class GradientRankFilterPruner(Pruner):
"""
A structured pruning base class that prunes the filters with the smallest
importance criterion in convolution layers (using gradient values)
to achieve a preset level of network sparsity.
"""
def __init__(self, model, config_list, optimizer, statistics_batch_num=1):
"""
Parameters
----------
model : torch.nn.module
Model to be pruned
config_list : list
support key for each list item:
- sparsity: percentage of convolutional filters to be pruned.
optimizer: torch.optim.Optimizer
Optimizer used to train model
statistics_batch_num : int
Num of batches for calculating contribution
"""
super().__init__(model, config_list, optimizer)
self.set_wrappers_attribute("if_calculated", False)
self.set_wrappers_attribute("contribution", None)
self.statistics_batch_num = statistics_batch_num
self.iterations = 0
self.old_step = self.optimizer.step
self.patch_optimizer(self.calc_contributions)
def calc_contributions(self):
raise NotImplementedError('{} calc_contributions is not implemented'.format(self.__class__.__name__))
def get_mask(self, base_mask, contribution, num_prune):
raise NotImplementedError('{} get_mask is not implemented'.format(self.__class__.__name__))
def calc_mask(self, wrapper, **kwargs):
"""
Calculate the mask of given layer.
Filters with the smallest importance criterion, which is calculated from the weights and their gradients, are masked.
Parameters
----------
wrapper : Module
the layer to instrument the compression operation
Returns
-------
dict
dictionary for storing masks
"""
weight = wrapper.module.weight.data
op_type = wrapper.type
config = wrapper.config
assert 0 <= config.get('sparsity') < 1, "sparsity must be in the range [0, 1)"
assert op_type in config.get('op_types')
if wrapper.if_calculated:
return None
mask_weight = torch.ones(weight.size()).type_as(weight).detach()
if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None:
mask_bias = torch.ones(wrapper.module.bias.size()).type_as(wrapper.module.bias).detach()
else:
mask_bias = None
mask = {'weight_mask': mask_weight, 'bias_mask': mask_bias}
try:
filters = weight.size(0)
num_prune = int(filters * config.get('sparsity'))
if filters < 2 or num_prune < 1 or self.iterations < self.statistics_batch_num:
return mask
mask = self.get_mask(mask, wrapper.contribution, num_prune)
finally:
if self.iterations >= self.statistics_batch_num:
wrapper.if_calculated = True
return mask
class TaylorFOWeightFilterPruner(GradientRankFilterPruner):
"""
A structured pruning algorithm that prunes the filters with the smallest
importance approximations based on the first order taylor expansion on the weight.
Molchanov, Pavlo and Mallya, Arun and Tyree, Stephen and Frosio, Iuri and Kautz, Jan,
"Importance Estimation for Neural Network Pruning", CVPR 2019.
http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf
"""
def __init__(self, model, config_list, optimizer, statistics_batch_num=1):
"""
Parameters
----------
model : torch.nn.module
Model to be pruned
config_list : list
support key for each list item:
- sparsity: percentage of convolutional filters to be pruned.
optimizer: torch.optim.Optimizer
Optimizer used to train model
statistics_batch_num : int
Num of batches for calculating contribution
"""
super().__init__(model, config_list, optimizer, statistics_batch_num)
def get_mask(self, base_mask, contribution, num_prune):
"""
Calculate the mask of given layer.
Filters with the smallest importance approximations are masked.
Parameters
----------
base_mask : dict
The basic mask with the same shape as the weight, where every item is 1.
contribution : torch.Tensor
Layer's importance approximations
num_prune : int
Num of filters to prune
Returns
-------
dict
dictionary for storing masks
"""
prune_indices = torch.argsort(contribution)[:num_prune]
for idx in prune_indices:
base_mask['weight_mask'][idx] = 0.
if base_mask['bias_mask'] is not None:
base_mask['bias_mask'][idx] = 0.
return base_mask
def calc_contributions(self):
"""
Calculate the estimated importance of filters as a sum of individual contribution
based on the first order taylor expansion.
"""
if self.iterations >= self.statistics_batch_num:
return
for wrapper in self.get_modules_wrapper():
filters = wrapper.module.weight.size(0)
contribution = (wrapper.module.weight*wrapper.module.weight.grad).data.pow(2).view(filters, -1).sum(dim=1)
if wrapper.contribution is None:
wrapper.contribution = contribution
else:
wrapper.contribution += contribution
self.iterations += 1
......@@ -60,10 +60,8 @@ class WeightRankFilterPruner(Pruner):
Filters with the smallest importance criterion of the kernel weights are masked.
Parameters
----------
layer : LayerInfo
the layer to instrument the compression operation
config : dict
layer's pruning config
wrapper : Module
the module to instrument the compression operation
Returns
-------
dict
......
......@@ -228,6 +228,52 @@ class CompressorTestCase(TestCase):
assert all(mask1['bias_mask'].numpy() == np.array([0., 0., 0., 1., 1.]))
assert all(mask2['bias_mask'].numpy() == np.array([0., 0., 0., 1., 1.]))
def test_torch_taylorFOweight_pruner(self):
"""
Filters with the minimum importance approximation based on the first-order
Taylor expansion on the weights (w*grad)**2 are pruned, as described in the paper:
Importance Estimation for Neural Network Pruning,
http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf
So if sparsity of conv1 is 0.2, the expected masks should mask out filter 0, this can be verified through:
`all(torch.sum(mask1['weight_mask'], (1, 2, 3)).numpy() == np.array([0., 25., 25., 25., 25.]))`
If sparsity of conv2 is 0.6, the expected masks should mask out filters 4, 5, 6, 7, 8 and 9; this can be verified through:
`all(torch.sum(mask2['weight_mask'], (1, 2, 3)).numpy() == np.array([125., 125., 125., 125., 0., 0., 0., 0., 0., 0., ]))`
"""
w1 = np.array([np.zeros((1, 5, 5)), np.ones((1, 5, 5)), np.ones((1, 5, 5)) * 2,
np.ones((1, 5, 5)) * 3, np.ones((1, 5, 5)) * 4])
w2 = np.array([[[[i + 1] * 5] * 5] * 5 for i in range(10)[::-1]])
grad1 = np.array([np.ones((1, 5, 5)) * -1, np.ones((1, 5, 5)) * 1, np.ones((1, 5, 5)) * -1,
np.ones((1, 5, 5)) * 1, np.ones((1, 5, 5)) * -1])
grad2 = np.array([[[[(-1)**i] * 5] * 5] * 5 for i in range(10)])
config_list = [{'sparsity': 0.2, 'op_types': ['Conv2d'], 'op_names': ['conv1']},
{'sparsity': 0.6, 'op_types': ['Conv2d'], 'op_names': ['conv2']}]
model = TorchModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
pruner = torch_compressor.TaylorFOWeightFilterPruner(model, config_list, optimizer, statistics_batch_num=1)
x = torch.rand((1, 1, 28, 28), requires_grad=True)
model.conv1.module.weight.data = torch.tensor(w1).float()
model.conv2.module.weight.data = torch.tensor(w2).float()
y = model(x)
y.backward(torch.ones_like(y))
model.conv1.module.weight.grad.data = torch.tensor(grad1).float()
model.conv2.module.weight.grad.data = torch.tensor(grad2).float()
optimizer.step()
mask1 = pruner.calc_mask(model.conv1)
mask2 = pruner.calc_mask(model.conv2)
assert all(torch.sum(mask1['weight_mask'], (1, 2, 3)).numpy() == np.array([0., 25., 25., 25., 25.]))
assert all(torch.sum(mask2['weight_mask'], (1, 2, 3)).numpy() == np.array([125., 125., 125., 125., 0., 0., 0., 0., 0., 0., ]))
def test_torch_QAT_quantizer(self):
model = TorchModel()
config_list = [{
......