Unverified Commit db3130d7 authored by J-shang's avatar J-shang Committed by GitHub

[Doc] Compression (#4574)

parent cef9babd
...@@ -51,7 +51,16 @@ class AutoCompressTaskGenerator(LotteryTicketTaskGenerator): ...@@ -51,7 +51,16 @@ class AutoCompressTaskGenerator(LotteryTicketTaskGenerator):
class AutoCompressPruner(IterativePruner): class AutoCompressPruner(IterativePruner):
""" r"""
For total iteration number :math:`N`, AutoCompressPruner prunes the parts of the model that survived the previous iteration with a fixed sparsity ratio (e.g., :math:`1-{(1-0.8)}^{(1/N)}`) to achieve the overall sparsity (e.g., :math:`0.8`):
1. Generate a sparsity distribution using SimulatedAnnealingPruner.
2. Perform ADMM-based pruning to generate the pruning result for the next iteration.
For more details, please refer to `AutoCompress: An Automatic DNN Structured Pruning Framework for Ultra-High Compression Rates <https://arxiv.org/abs/1907.03141>`__.
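A minimal sketch of how such a fixed per-iteration ratio relates to the overall sparsity (illustrative helper only, not part of the pruner's API):

.. code-block:: python

    # Illustrative: derive the fixed per-iteration pruning ratio from the
    # overall target sparsity and the total iteration number N.
    def per_iteration_ratio(overall_sparsity: float, total_iteration: int) -> float:
        return 1 - (1 - overall_sparsity) ** (1 / total_iteration)

    ratio = per_iteration_ratio(0.8, 4)   # ratio applied in each of N = 4 iterations
    remaining = (1 - ratio) ** 4          # ~0.2 of the weights remain, i.e. 0.8 overall sparsity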
Parameters Parameters
---------- ----------
model : Module model : Module
...@@ -70,7 +79,7 @@ class AutoCompressPruner(IterativePruner): ...@@ -70,7 +79,7 @@ class AutoCompressPruner(IterativePruner):
The model will be trained or inferenced `training_epochs` epochs. The model will be trained or inferenced `training_epochs` epochs.
- traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) - traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
- criterion : Callable[[Tensor, Tensor], Tensor]. - criterion : Callable[[Tensor, Tensor], Tensor].
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
- iterations : int. - iterations : int.
...@@ -107,6 +116,34 @@ class AutoCompressPruner(IterativePruner): ...@@ -107,6 +116,34 @@ class AutoCompressPruner(IterativePruner):
If set True, speed up the model at the end of each iteration to make the pruned model compact. If set True, speed up the model at the end of each iteration to make the pruned model compact.
dummy_input : Optional[torch.Tensor] dummy_input : Optional[torch.Tensor]
If `speed_up` is True, `dummy_input` is required for tracing the model in speed up. If `speed_up` is True, `dummy_input` is required for tracing the model in speed up.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import AutoCompressPruner
>>> model = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> evaluator = ...
>>> finetuner = ...
>>> admm_params = {
>>> 'trainer': trainer,
>>> 'traced_optimizer': traced_optimizer,
>>> 'criterion': criterion,
>>> 'iterations': 10,
>>> 'training_epochs': 1
>>> }
>>> sa_params = {
>>> 'evaluator': evaluator
>>> }
>>> pruner = AutoCompressPruner(model, config_list, 10, admm_params, sa_params, finetuner=finetuner)
>>> pruner.compress()
>>> _, model, masks, _, _ = pruner.get_best_result()
The full script can be found :githublink:`here <examples/model_compress/pruning/v2/auto_compress_pruner.py>`.
""" """
def __init__(self, model: Module, config_list: List[Dict], total_iteration: int, admm_params: Dict, def __init__(self, model: Module, config_list: List[Dict], total_iteration: int, admm_params: Dict,
......
...@@ -125,11 +125,14 @@ class BasicPruner(Pruner): ...@@ -125,11 +125,14 @@ class BasicPruner(Pruner):
class LevelPruner(BasicPruner): class LevelPruner(BasicPruner):
""" r"""
This is a basic pruner; in some papers it is called magnitude pruning or fine-grained pruning.
It masks the smallest-magnitude weights in each specified layer according to the sparsity ratio configured in the config list.
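Conceptually, the mask zeros out the smallest-magnitude entries of each weight tensor; a minimal sketch of this idea for a single tensor (not the pruner's actual implementation):

.. code-block:: python

    import torch

    def level_mask(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
        # keep the largest-magnitude entries, zero out the rest
        num_prune = int(weight.numel() * sparsity)
        if num_prune == 0:
            return torch.ones_like(weight)
        threshold = weight.abs().flatten().kthvalue(num_prune).values
        return (weight.abs() > threshold).type_as(weight)

    mask = level_mask(torch.randn(64, 64), sparsity=0.8)   # roughly 80% zeros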
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -181,7 +184,16 @@ class LevelPruner(BasicPruner): ...@@ -181,7 +184,16 @@ class LevelPruner(BasicPruner):
pruning result: Weight tensor whose shape is [64, 64] will be split into 4 [32, 32] sub blocks. pruning result: Weight tensor whose shape is [64, 64] will be split into 4 [32, 32] sub blocks.
Each sub block will be pruned 256 values. Each sub block will be pruned 256 values.
Examples
--------
>>> model = ...
>>> from nni.algorithms.compression.v2.pytorch.pruning import LevelPruner
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['default'] }]
>>> pruner = LevelPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/level_pruning_torch.py <examples/model_compress/pruning/v2/level_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], mode: str = "normal", balance_gran: Optional[List] = None): def __init__(self, model: Module, config_list: List[Dict], mode: str = "normal", balance_gran: Optional[List] = None):
...@@ -215,7 +227,7 @@ class NormPruner(BasicPruner): ...@@ -215,7 +227,7 @@ class NormPruner(BasicPruner):
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -272,11 +284,20 @@ class NormPruner(BasicPruner): ...@@ -272,11 +284,20 @@ class NormPruner(BasicPruner):
class L1NormPruner(NormPruner): class L1NormPruner(NormPruner):
""" r"""
L1 norm pruner computes the L1 norm of the layer weight along the first dimension,
then prunes the weight blocks on this dimension with the smallest L1 norm values.
That is, it uses the L1 norm of each filter in a convolution layer, or of each row of the weight in a linear layer, as the metric value.
For more details, please refer to `Pruning Filters for Efficient ConvNets <https://arxiv.org/abs/1608.08710>`__\.
In addition, L1 norm pruner also supports dependency-aware mode.
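For intuition, the metric for a convolution layer could be computed as follows (a sketch, not NNI's internal code):

.. code-block:: python

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(16, 32, kernel_size=3)
    # L1 norm of each filter, i.e. along the first (output channel) dimension
    l1_metric = conv.weight.detach().abs().sum(dim=[1, 2, 3])   # shape: [32]
    # filters with the smallest L1 norm are the pruning candidates
    candidates = l1_metric.argsort()[: int(32 * 0.5)]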
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -305,16 +326,21 @@ class L1NormPruner(NormPruner): ...@@ -305,16 +326,21 @@ class L1NormPruner(NormPruner):
class L2NormPruner(NormPruner): class L2NormPruner(NormPruner):
""" r"""
L2 norm pruner is a variant of L1 norm pruner.
The only difference between the L2 norm pruner and the L1 norm pruner is that the L2 norm pruner ranks and prunes the weight blocks by their L2 norm instead of their L1 norm.
L2 norm pruner also supports dependency-aware mode.
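The only change relative to the L1 metric is the norm used to rank the blocks; a sketch:

.. code-block:: python

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(16, 32, kernel_size=3)
    filters = conv.weight.detach().flatten(start_dim=1)   # shape: [32, 144]
    l2_metric = torch.norm(filters, p=2, dim=1)           # L2 norm per filter
    candidates = l2_metric.argsort()[: int(32 * 0.5)]     # smallest L2 norms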
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equals to sparsity. - sparsity_per_layer : Equals to sparsity.
- op_types : Conv2d and Linear are supported in L1NormPruner. - op_types : Conv2d and Linear are supported in L2NormPruner.
- op_names : Operation names to be pruned. - op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI. - op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True then the layers setting by op_types and op_names will be excluded from pruning. - exclude : Set True then the layers setting by op_types and op_names will be excluded from pruning.
...@@ -330,6 +356,16 @@ class L2NormPruner(NormPruner): ...@@ -330,6 +356,16 @@ class L2NormPruner(NormPruner):
dummy_input : Optional[torch.Tensor] dummy_input : Optional[torch.Tensor]
The dummy input to analyze the topology constraints. Note that, the dummy_input The dummy input to analyze the topology constraints. Note that, the dummy_input
should on the same device with the model. should on the same device with the model.
Examples
--------
>>> model = ...
>>> from nni.algorithms.compression.v2.pytorch.pruning import L2NormPruner
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = L2NormPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/norm_pruning_torch.py <examples/model_compress/pruning/v2/norm_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], def __init__(self, model: Module, config_list: List[Dict],
...@@ -338,11 +374,18 @@ class L2NormPruner(NormPruner): ...@@ -338,11 +374,18 @@ class L2NormPruner(NormPruner):
class FPGMPruner(BasicPruner): class FPGMPruner(BasicPruner):
""" r"""
FPGM pruner prunes the weight blocks on the first dimension that are closest to the geometric median of all blocks in the same layer.
In other words, FPGM chooses the weight blocks whose contribution is the most replaceable.
For more details, please refer to `Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration <https://arxiv.org/abs/1811.00250>`__.
FPGM pruner also supports dependency-aware mode.
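A rough sketch of the metric: for each filter, sum its Euclidean distances to all other filters; the filters with the smallest total distance are closest to the geometric median and therefore the most replaceable (illustrative, not the exact implementation):

.. code-block:: python

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(16, 32, kernel_size=3)
    filters = conv.weight.detach().flatten(start_dim=1)   # shape: [32, 144]
    distances = torch.cdist(filters, filters, p=2)        # pairwise distances, [32, 32]
    fpgm_metric = distances.sum(dim=1)                    # total distance per filter
    candidates = fpgm_metric.argsort()[: int(32 * 0.3)]   # most replaceable filters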
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -363,6 +406,16 @@ class FPGMPruner(BasicPruner): ...@@ -363,6 +406,16 @@ class FPGMPruner(BasicPruner):
dummy_input : Optional[torch.Tensor] dummy_input : Optional[torch.Tensor]
The dummy input to analyze the topology constraints. Note that, the dummy_input The dummy input to analyze the topology constraints. Note that, the dummy_input
should on the same device with the model. should on the same device with the model.
Examples
--------
>>> model = ...
>>> from nni.algorithms.compression.v2.pytorch.pruning import FPGMPruner
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = FPGMPruner(model, config_list)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/fpgm_pruning_torch.py <examples/model_compress/pruning/v2/fpgm_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], def __init__(self, model: Module, config_list: List[Dict],
...@@ -396,11 +449,16 @@ class FPGMPruner(BasicPruner): ...@@ -396,11 +449,16 @@ class FPGMPruner(BasicPruner):
class SlimPruner(BasicPruner): class SlimPruner(BasicPruner):
""" r"""
Slim pruner adds sparsity regularization on the scaling factors of batch normalization (BN) layers during training to identify unimportant channels.
The channels with small scaling factor values will be pruned.
For more details, please refer to `Learning Efficient Convolutional Networks through Network Slimming <https://arxiv.org/abs/1708.06519>`__\.
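In practice, the sparsity regularization amounts to adding an L1 penalty on the BN scaling factors to the training loss; a hedged sketch of such a regularized loss (the penalty coefficient is illustrative):

.. code-block:: python

    import torch
    import torch.nn as nn

    def slim_regularized_loss(model: nn.Module, base_loss: torch.Tensor,
                              scale: float = 1e-4) -> torch.Tensor:
        # L1 penalty on the scaling factors (gamma) of all BatchNorm2d layers
        penalty = sum(m.weight.abs().sum() for m in model.modules()
                      if isinstance(m, nn.BatchNorm2d))
        return base_loss + scale * penalty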
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -432,7 +490,7 @@ class SlimPruner(BasicPruner): ...@@ -432,7 +490,7 @@ class SlimPruner(BasicPruner):
model.train(mode=training) model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor] criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_epochs : int training_epochs : int
...@@ -444,6 +502,21 @@ class SlimPruner(BasicPruner): ...@@ -444,6 +502,21 @@ class SlimPruner(BasicPruner):
If prune the model in a global way, all layer weights with same config will be considered uniformly. If prune the model in a global way, all layer weights with same config will be considered uniformly.
That means a single layer may not reach or exceed the sparsity setting in config, That means a single layer may not reach or exceed the sparsity setting in config,
but the total pruned weights meet the sparsity setting. but the total pruned weights meet the sparsity setting.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import SlimPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['BatchNorm2d'] }]
>>> pruner = SlimPruner(model, config_list, trainer, traced_optimizer, criterion, training_epochs=1)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/slim_pruning_torch.py <examples/model_compress/pruning/v2/slim_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None], def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
...@@ -507,7 +580,7 @@ class ActivationPruner(BasicPruner): ...@@ -507,7 +580,7 @@ class ActivationPruner(BasicPruner):
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -537,7 +610,7 @@ class ActivationPruner(BasicPruner): ...@@ -537,7 +610,7 @@ class ActivationPruner(BasicPruner):
model.train(mode=training) model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor] criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_batches training_batches
...@@ -627,6 +700,82 @@ class ActivationPruner(BasicPruner): ...@@ -627,6 +700,82 @@ class ActivationPruner(BasicPruner):
class ActivationAPoZRankPruner(ActivationPruner): class ActivationAPoZRankPruner(ActivationPruner):
r"""
Activation APoZ rank pruner prunes the weight blocks on the first dimension with the smallest importance scores,
where the importance criterion ``APoZ`` (Average Percentage of Zeros) is calculated from the output activations of convolution layers, to achieve a preset level of network sparsity.
The pruning criterion ``APoZ`` is explained in the paper `Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures <https://arxiv.org/abs/1607.03250>`__.
The APoZ is defined as:
:math:`APoZ_{c}^{(i)} = APoZ\left(O_{c}^{(i)}\right)=\frac{\sum_{k}^{N} \sum_{j}^{M} f\left(O_{c, j}^{(i)}(k)=0\right)}{N \times M}`
Activation APoZ rank pruner also supports dependency-aware mode.
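For intuition, APoZ for one convolution layer can be estimated from a batch of its (ReLU) output activations as the fraction of zero entries per output channel (a sketch, not NNI's internal code):

.. code-block:: python

    import torch

    # activations: output of a ReLU following a conv layer, shape [N, C, H, W]
    activations = torch.relu(torch.randn(8, 32, 14, 14))
    zeros = (activations == 0).float()
    apoz_per_channel = zeros.mean(dim=[0, 2, 3])   # shape: [C]
    # channels with the highest APoZ are considered the least important
    candidates = apoz_per_channel.argsort(descending=True)[: int(32 * 0.5)]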
Parameters
----------
model : torch.nn.Module
Model to be pruned.
config_list : List[Dict]
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equals to sparsity.
- op_types : Conv2d and Linear are supported in ActivationAPoZRankPruner.
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True then the layers setting by op_types and op_names will be excluded from pruning.
trainer : Callable[[Module, Optimizer, Callable], None]
A callable function used to train model or just inference. Take model, optimizer, criterion as input.
The model will be trained or inferenced `training_epochs` epochs.
Example::
def trainer(model: Module, optimizer: Optimizer, criterion: Callable[[Tensor, Tensor], Tensor]):
training = model.training
model.train(mode=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
# If you don't want to update the model, you can skip `optimizer.step()`, and set train mode False.
optimizer.step()
model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_batches
The batch number used to collect activations.
mode : str
'normal' or 'dependency_aware'.
If prune the model in a dependency-aware way, this pruner will
prune the model according to the activation-based metrics and the channel-dependency or
group-dependency of the model. In this way, the pruner will force the conv layers
that have dependencies to prune the same channels, so the speedup module can better
harvest the speed benefit from the pruned model. Note that, if set 'dependency_aware'
, the dummy_input cannot be None, because the pruner needs a dummy input to trace the
dependency between the conv layers.
dummy_input : Optional[torch.Tensor]
The dummy input to analyze the topology constraints. Note that, the dummy_input
should on the same device with the model.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import ActivationAPoZRankPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = ActivationAPoZRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/activation_pruning_torch.py <examples/model_compress/pruning/v2/activation_pruning_torch.py>`
"""
def _activation_trans(self, output: Tensor) -> Tensor: def _activation_trans(self, output: Tensor) -> Tensor:
# return a matrix that the position of zero in `output` is one, others is zero. # return a matrix that the position of zero in `output` is one, others is zero.
return torch.eq(self._activation(output.detach()), torch.zeros_like(output)).type_as(output) return torch.eq(self._activation(output.detach()), torch.zeros_like(output)).type_as(output)
...@@ -636,6 +785,80 @@ class ActivationAPoZRankPruner(ActivationPruner): ...@@ -636,6 +785,80 @@ class ActivationAPoZRankPruner(ActivationPruner):
class ActivationMeanRankPruner(ActivationPruner): class ActivationMeanRankPruner(ActivationPruner):
r"""
Activation mean rank pruner prunes the weight blocks on the first dimension with the smallest importance scores,
where the importance criterion ``mean activation`` is calculated from the output activations of convolution layers, to achieve a preset level of network sparsity.
The pruning criterion ``mean activation`` is explained in section 2.2 of the paper `Pruning Convolutional Neural Networks for Resource Efficient Inference <https://arxiv.org/abs/1611.06440>`__.
Activation mean rank pruner also supports dependency-aware mode.
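The metric here is simply the mean activation per output channel, averaged over the collected batches; a sketch:

.. code-block:: python

    import torch

    # activations collected from a conv layer's output, shape [N, C, H, W]
    activations = torch.relu(torch.randn(8, 32, 14, 14))
    mean_activation = activations.mean(dim=[0, 2, 3])          # shape: [C]
    candidates = mean_activation.argsort()[: int(32 * 0.5)]    # smallest mean activation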
Parameters
----------
model : torch.nn.Module
Model to be pruned.
config_list : List[Dict]
Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed.
- sparsity_per_layer : Equals to sparsity.
- op_types : Conv2d and Linear are supported in ActivationMeanRankPruner.
- op_names : Operation names to be pruned.
- op_partial_names: Operation partial names to be pruned, will be autocompleted by NNI.
- exclude : Set True then the layers setting by op_types and op_names will be excluded from pruning.
trainer : Callable[[Module, Optimizer, Callable], None]
A callable function used to train model or just inference. Take model, optimizer, criterion as input.
The model will be trained or inferenced `training_epochs` epochs.
Example::
def trainer(model: Module, optimizer: Optimizer, criterion: Callable[[Tensor, Tensor], Tensor]):
training = model.training
model.train(mode=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
# If you don't want to update the model, you can skip `optimizer.step()`, and set train mode False.
optimizer.step()
model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_batches
The batch number used to collect activations.
mode : str
'normal' or 'dependency_aware'.
If prune the model in a dependency-aware way, this pruner will
prune the model according to the activation-based metrics and the channel-dependency or
group-dependency of the model. In this way, the pruner will force the conv layers
that have dependencies to prune the same channels, so the speedup module can better
harvest the speed benefit from the pruned model. Note that, if set 'dependency_aware'
, the dummy_input cannot be None, because the pruner needs a dummy input to trace the
dependency between the conv layers.
dummy_input : Optional[torch.Tensor]
The dummy input to analyze the topology constraints. Note that, the dummy_input
should on the same device with the model.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import ActivationMeanRankPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = ActivationMeanRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/activation_pruning_torch.py <examples/model_compress/pruning/v2/activation_pruning_torch.py>`
"""
def _activation_trans(self, output: Tensor) -> Tensor: def _activation_trans(self, output: Tensor) -> Tensor:
# return the activation of `output` directly. # return the activation of `output` directly.
return self._activation(output.detach()) return self._activation(output.detach())
...@@ -645,11 +868,21 @@ class ActivationMeanRankPruner(ActivationPruner): ...@@ -645,11 +868,21 @@ class ActivationMeanRankPruner(ActivationPruner):
class TaylorFOWeightPruner(BasicPruner): class TaylorFOWeightPruner(BasicPruner):
""" r"""
Taylor FO weight pruner prunes the weight blocks on the first dimension,
based on estimated importance calculated from the first-order Taylor expansion on the weights, to achieve a preset level of network sparsity.
The estimated importance is defined in the paper `Importance Estimation for Neural Network Pruning <http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf>`__ as:
:math:`\widehat{\mathcal{I}}_{\mathcal{S}}^{(1)}(\mathbf{W}) \triangleq \sum_{s \in \mathcal{S}} \mathcal{I}_{s}^{(1)}(\mathbf{W})=\sum_{s \in \mathcal{S}}\left(g_{s} w_{s}\right)^{2}`
Taylor FO weight pruner also supports dependency-aware mode.
What's more, we provide a global-sort mode for this pruner which is aligned with paper implementation.
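For intuition, the per-filter importance could be accumulated after a backward pass as below (a sketch; the real pruner collects this over ``training_batches`` batches):

.. code-block:: python

    import torch
    import torch.nn as nn

    conv = nn.Conv2d(3, 32, kernel_size=3)
    x, target = torch.randn(4, 3, 32, 32), torch.randn(4, 32, 30, 30)
    loss = nn.functional.mse_loss(conv(x), target)
    loss.backward()

    # (g_s * w_s)^2 summed within each filter -> importance per output channel
    importance = (conv.weight.grad * conv.weight.detach()) ** 2
    importance = importance.sum(dim=[1, 2, 3])             # shape: [32]
    candidates = importance.argsort()[: int(32 * 0.5)]     # least important filters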
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
Model to be pruned Model to be pruned.
config_list : List[Dict] config_list : List[Dict]
Supported keys: Supported keys:
- sparsity : This is to specify the sparsity for each layer in this config to be compressed. - sparsity : This is to specify the sparsity for each layer in this config to be compressed.
...@@ -681,7 +914,7 @@ class TaylorFOWeightPruner(BasicPruner): ...@@ -681,7 +914,7 @@ class TaylorFOWeightPruner(BasicPruner):
model.train(mode=training) model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor] criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_batches : int training_batches : int
...@@ -703,6 +936,21 @@ class TaylorFOWeightPruner(BasicPruner): ...@@ -703,6 +936,21 @@ class TaylorFOWeightPruner(BasicPruner):
dummy_input : Optional[torch.Tensor] dummy_input : Optional[torch.Tensor]
The dummy input to analyze the topology constraints. Note that, the dummy_input The dummy input to analyze the topology constraints. Note that, the dummy_input
should on the same device with the model. should on the same device with the model.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import TaylorFOWeightPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = TaylorFOWeightPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/taylorfo_pruning_torch.py <examples/model_compress/pruning/v2/taylorfo_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None], def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
...@@ -772,13 +1020,17 @@ class TaylorFOWeightPruner(BasicPruner): ...@@ -772,13 +1020,17 @@ class TaylorFOWeightPruner(BasicPruner):
class ADMMPruner(BasicPruner): class ADMMPruner(BasicPruner):
""" r"""
Alternating Direction Method of Multipliers (ADMM) is a mathematical optimization technique
that decomposes the original nonconvex problem into two subproblems which can be solved iteratively.
In the weight pruning problem, these two subproblems are solved via 1) the gradient descent algorithm and 2) Euclidean projection, respectively.
During the process of solving these two subproblems, the weights of the original model will be changed.
Then a fine-grained pruning will be applied to prune the model according to the given config list.
This solution framework applies both to non-structured pruning and to different variations of structured pruning schemes.
For more details, please refer to `A Systematic DNN Weight Pruning Framework using Alternating Direction Method of Multipliers <https://arxiv.org/abs/1804.03294>`__.
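The Euclidean projection subproblem amounts to keeping the largest-magnitude entries that satisfy the sparsity constraint; a minimal sketch of that projection for one weight tensor (illustrative, not the pruner's code):

.. code-block:: python

    import torch

    def euclidean_projection(weight: torch.Tensor, sparsity: float) -> torch.Tensor:
        # project onto the set of tensors with at most (1 - sparsity) * numel nonzeros
        num_prune = int(weight.numel() * sparsity)
        if num_prune == 0:
            return weight.clone()
        threshold = weight.abs().flatten().kthvalue(num_prune).values
        return weight * (weight.abs() > threshold).type_as(weight)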
Parameters Parameters
---------- ----------
...@@ -814,13 +1066,28 @@ class ADMMPruner(BasicPruner): ...@@ -814,13 +1066,28 @@ class ADMMPruner(BasicPruner):
model.train(mode=training) model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor] criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
iterations : int iterations : int
The total iteration number in admm pruning algorithm. The total iteration number in admm pruning algorithm.
training_epochs : int training_epochs : int
The epoch number for training model in each iteration. The epoch number for training model in each iteration.
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import ADMMPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = ADMMPruner(model, config_list, trainer, traced_optimizer, criterion, iterations=10, training_epochs=1)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/admm_pruning_torch.py <examples/model_compress/pruning/v2/admm_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None], def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
......
...@@ -70,7 +70,11 @@ class IterativePruner(PruningScheduler): ...@@ -70,7 +70,11 @@ class IterativePruner(PruningScheduler):
class LinearPruner(IterativePruner): class LinearPruner(IterativePruner):
""" r"""
Linear pruner is an iterative pruner that increases the sparsity evenly from zero during the iterations.
For example, if the final sparsity is set to 0.5 and the iteration number is 5, the sparsities used in the iterations are ``[0, 0.1, 0.2, 0.3, 0.4, 0.5]``.
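A sketch of the resulting schedule (illustrative helper, not part of the API):

.. code-block:: python

    # Illustrative: evenly spaced sparsity schedule across the iterations.
    def linear_schedule(final_sparsity: float, total_iteration: int) -> list:
        return [final_sparsity * i / total_iteration for i in range(total_iteration + 1)]

    linear_schedule(0.5, 5)   # [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]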
Parameters Parameters
---------- ----------
model : Module model : Module
...@@ -98,6 +102,17 @@ class LinearPruner(IterativePruner): ...@@ -98,6 +102,17 @@ class LinearPruner(IterativePruner):
If evaluator is None, the best result refers to the latest result. If evaluator is None, the best result refers to the latest result.
pruning_params : Dict pruning_params : Dict
If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in. If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in.
Examples
--------
>>> from nni.algorithms.compression.v2.pytorch.pruning import LinearPruner
>>> config_list = [{'sparsity': 0.8, 'op_types': ['Conv2d']}]
>>> finetuner = ...
>>> pruner = LinearPruner(model, config_list, pruning_algorithm='l1', total_iteration=10, finetuner=finetuner)
>>> pruner.compress()
>>> _, model, masks, _, _ = pruner.get_best_result()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/iterative_pruning_torch.py <examples/model_compress/pruning/v2/iterative_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str, def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str,
...@@ -117,7 +132,14 @@ class LinearPruner(IterativePruner): ...@@ -117,7 +132,14 @@ class LinearPruner(IterativePruner):
class AGPPruner(IterativePruner): class AGPPruner(IterativePruner):
""" r"""
This is an iterative pruner, in which the sparsity is increased from an initial sparsity value :math:`s_{i}` (usually 0) to a final sparsity value :math:`s_{f}` over a span of :math:`n` pruning iterations,
starting at training step :math:`t_{0}` and with pruning frequency :math:`\Delta t`:
:math:`s_{t}=s_{f}+\left(s_{i}-s_{f}\right)\left(1-\frac{t-t_{0}}{n \Delta t}\right)^{3} \text { for } t \in\left\{t_{0}, t_{0}+\Delta t, \ldots, t_{0} + n \Delta t\right\}`
For more details please refer to `To prune, or not to prune: exploring the efficacy of pruning for model compression <https://arxiv.org/abs/1710.01878>`__\.
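A sketch of this schedule as a plain function (illustrative helper):

.. code-block:: python

    # Illustrative: AGP sparsity at training step t.
    def agp_sparsity(t, s_i, s_f, t0, n, delta_t):
        progress = min(max((t - t0) / (n * delta_t), 0.0), 1.0)
        return s_f + (s_i - s_f) * (1 - progress) ** 3

    agp_sparsity(t=500, s_i=0.0, s_f=0.8, t0=0, n=10, delta_t=100)   # 0.7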
Parameters Parameters
---------- ----------
model : Module model : Module
...@@ -145,6 +167,17 @@ class AGPPruner(IterativePruner): ...@@ -145,6 +167,17 @@ class AGPPruner(IterativePruner):
If evaluator is None, the best result refers to the latest result. If evaluator is None, the best result refers to the latest result.
pruning_params : Dict pruning_params : Dict
If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in. If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in.
Examples
--------
>>> from nni.algorithms.compression.v2.pytorch.pruning import AGPPruner
>>> config_list = [{'sparsity': 0.8, 'op_types': ['Conv2d']}]
>>> finetuner = ...
>>> pruner = AGPPruner(model, config_list, pruning_algorithm='l1', total_iteration=10, finetuner=finetuner)
>>> pruner.compress()
>>> _, model, masks, _, _ = pruner.get_best_result()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/iterative_pruning_torch.py <examples/model_compress/pruning/v2/iterative_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str, def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str,
...@@ -164,7 +197,25 @@ class AGPPruner(IterativePruner): ...@@ -164,7 +197,25 @@ class AGPPruner(IterativePruner):
class LotteryTicketPruner(IterativePruner): class LotteryTicketPruner(IterativePruner):
""" r"""
`The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks <https://arxiv.org/abs/1803.03635>`__\ ,
authored by Jonathan Frankle and Michael Carbin, provides comprehensive measurement and analysis,
and articulates the *lottery ticket hypothesis*\ : dense, randomly-initialized, feed-forward networks contain subnetworks (*winning tickets*\ ) that
-- when trained in isolation -- reach test accuracy comparable to the original network in a similar number of iterations.
In this paper, the authors use the following process to prune a model, called *iterative pruning*\ :
#. Randomly initialize a neural network :math:`f(x; \theta_0)` (where :math:`\theta_0 \sim \mathcal{D}_{\theta}`).
#. Train the network for :math:`j` iterations, arriving at parameters :math:`\theta_j`.
#. Prune :math:`p\%` of the parameters in :math:`\theta_j`, creating a mask :math:`m`.
#. Reset the remaining parameters to their values in :math:`\theta_0`, creating the winning ticket :math:`f(x; m \odot \theta_0)`.
#. Repeat steps 2, 3, and 4.
If the configured final sparsity is :math:`P` (e.g., 0.8) and there are :math:`n` pruning iterations,
each iteration prunes :math:`1-(1-P)^{1/n}` of the weights that survived the previous round.
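A sketch of how the cumulative sparsity evolves across the rounds under this rule (illustrative helper):

.. code-block:: python

    # Illustrative: cumulative sparsity after each of n iterative pruning rounds.
    def cumulative_sparsity(P: float, n: int) -> list:
        ratio = 1 - (1 - P) ** (1 / n)
        return [1 - (1 - ratio) ** i for i in range(1, n + 1)]

    cumulative_sparsity(0.8, 5)   # last entry is ~0.8, the configured final sparsity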
Parameters Parameters
---------- ----------
model : Module model : Module
...@@ -194,6 +245,18 @@ class LotteryTicketPruner(IterativePruner): ...@@ -194,6 +245,18 @@ class LotteryTicketPruner(IterativePruner):
If set True, the model weight will reset to the original model weight at the end of each iteration step. If set True, the model weight will reset to the original model weight at the end of each iteration step.
pruning_params : Dict pruning_params : Dict
If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in. If the chosen pruning_algorithm has extra parameters, put them as a dict to pass in.
Examples
--------
>>> from nni.algorithms.compression.v2.pytorch.pruning import LotteryTicketPruner
>>> config_list = [{'sparsity': 0.8, 'op_types': ['Conv2d']}]
>>> finetuner = ...
>>> pruner = LotteryTicketPruner(model, config_list, pruning_algorithm='l1', total_iteration=10, finetuner=finetuner, reset_weight=True)
>>> pruner.compress()
>>> _, model, masks, _, _ = pruner.get_best_result()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/iterative_pruning_torch.py <examples/model_compress/pruning/v2/iterative_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str, def __init__(self, model: Module, config_list: List[Dict], pruning_algorithm: str,
...@@ -215,6 +278,19 @@ class LotteryTicketPruner(IterativePruner): ...@@ -215,6 +278,19 @@ class LotteryTicketPruner(IterativePruner):
class SimulatedAnnealingPruner(IterativePruner): class SimulatedAnnealingPruner(IterativePruner):
""" """
We implement a guided heuristic search method, the Simulated Annealing (SA) algorithm. As mentioned in the paper, this method is enhanced with guided search based on prior experience.
The enhanced SA technique is based on the observation that a DNN layer with a larger number of weights often has a higher degree of model compression with less impact on overall accuracy.
* Randomly initialize a pruning rate distribution (sparsities).
* While current_temperature > stop_temperature:
  #. Generate a perturbation of the current distribution.
  #. Perform a fast evaluation on the perturbed distribution.
  #. Accept the perturbation according to the performance and an acceptance probability; if not accepted, return to step 1.
  #. Cool down: current_temperature <- current_temperature * cool_down_rate.
For more details, please refer to `AutoCompress: An Automatic DNN Structured Pruning Framework for Ultra-High Compression Rates <https://arxiv.org/abs/1907.03141>`__.
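A schematic of the annealing loop described above (pseudocode-level sketch; ``evaluate`` and ``perturb`` stand in for the pruner's internal steps, and higher evaluation scores are assumed to be better):

.. code-block:: python

    import math
    import random

    def simulated_annealing(sparsities, evaluate, perturb, start_temperature=100,
                            stop_temperature=20, cool_down_rate=0.9):
        current_score = evaluate(sparsities)
        temperature = start_temperature
        while temperature > stop_temperature:
            candidate = perturb(sparsities)
            score = evaluate(candidate)
            delta = score - current_score
            # accept better candidates, and worse ones with a temperature-dependent probability
            if delta > 0 or random.random() < math.exp(delta / temperature):
                sparsities, current_score = candidate, score
            temperature *= cool_down_rate
        return sparsities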
Parameters Parameters
---------- ----------
model : Module model : Module
...@@ -246,6 +322,19 @@ class SimulatedAnnealingPruner(IterativePruner): ...@@ -246,6 +322,19 @@ class SimulatedAnnealingPruner(IterativePruner):
If set True, speed up the model at the end of each iteration to make the pruned model compact. If set True, speed up the model at the end of each iteration to make the pruned model compact.
dummy_input : Optional[torch.Tensor] dummy_input : Optional[torch.Tensor]
If `speed_up` is True, `dummy_input` is required for tracing the model in speed up. If `speed_up` is True, `dummy_input` is required for tracing the model in speed up.
Examples
--------
>>> from nni.algorithms.compression.v2.pytorch.pruning import SimulatedAnnealingPruner
>>> model = ...
>>> config_list = [{'sparsity': 0.8, 'op_types': ['Conv2d']}]
>>> evaluator = ...
>>> finetuner = ...
>>> pruner = SimulatedAnnealingPruner(model, config_list, pruning_algorithm='l1', evaluator=evaluator, cool_down_rate=0.9, finetuner=finetuner)
>>> pruner.compress()
>>> _, model, masks, _, _ = pruner.get_best_result()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/simulated_anealing_pruning_torch.py <examples/model_compress/pruning/v2/simulated_anealing_pruning_torch.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], evaluator: Callable[[Module], float], start_temperature: float = 100, def __init__(self, model: Module, config_list: List[Dict], evaluator: Callable[[Module], float], start_temperature: float = 100,
......
...@@ -124,7 +124,21 @@ class WeightScoreTrainerBasedDataCollector(TrainerBasedDataCollector): ...@@ -124,7 +124,21 @@ class WeightScoreTrainerBasedDataCollector(TrainerBasedDataCollector):
class MovementPruner(BasicPruner): class MovementPruner(BasicPruner):
""" r"""
Movement pruner is an implementation of movement pruning.
This is a "fine-pruning" algorithm, which means the masks may change during each fine-tuning step.
Each weight element will be scored by the negative of the sum of the products of the weight and its gradient over the training steps.
This means that weight elements moving towards zero accumulate negative scores, while weight elements moving away from zero accumulate positive scores.
The weight elements with low scores will be masked during inference.
The following figure from the paper shows the weight pruning by movement pruning.
.. image:: ../../img/movement_pruning.png
:target: ../../img/movement_pruning.png
:alt:
For more details, please refer to `Movement Pruning: Adaptive Sparsity by Fine-Tuning <https://arxiv.org/abs/2005.07683>`__.
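For intuition, the movement score for one weight tensor could be accumulated over training steps like this (a sketch, not the pruner's internal bookkeeping):

.. code-block:: python

    import torch
    import torch.nn as nn

    linear = nn.Linear(16, 4)
    movement_score = torch.zeros_like(linear.weight)

    x, target = torch.randn(8, 16), torch.randn(8, 4)
    loss = nn.functional.mse_loss(linear(x), target)
    loss.backward()

    with torch.no_grad():
        # weights moving towards zero accumulate negative scores,
        # weights moving away from zero accumulate positive scores
        movement_score += -linear.weight.grad * linear.weight

    # entries of `movement_score` below the sparsity threshold would be masked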
Parameters Parameters
---------- ----------
model : torch.nn.Module model : torch.nn.Module
...@@ -158,7 +172,7 @@ class MovementPruner(BasicPruner): ...@@ -158,7 +172,7 @@ class MovementPruner(BasicPruner):
model.train(mode=training) model.train(mode=training)
traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer) traced_optimizer : nni.common.serializer.Traceable(torch.optim.Optimizer)
The traced optimizer instance which the optimizer class is wrapped by nni.trace. The traced optimizer instance which the optimizer class is wrapped by nni.trace.
E.g. ``traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())``.
criterion : Callable[[Tensor, Tensor], Tensor] criterion : Callable[[Tensor, Tensor], Tensor]
The criterion function used in trainer. Take model output and target value as input, and return the loss. The criterion function used in trainer. Take model output and target value as input, and return the loss.
training_epochs : int training_epochs : int
...@@ -171,6 +185,21 @@ class MovementPruner(BasicPruner): ...@@ -171,6 +185,21 @@ class MovementPruner(BasicPruner):
The number of steps at which sparsity stops growing, note that the sparsity stop growing doesn't mean masks not changed. The number of steps at which sparsity stops growing, note that the sparsity stop growing doesn't mean masks not changed.
The sparsity after each `optimizer.step()` is: The sparsity after each `optimizer.step()` is:
total_sparsity * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3). total_sparsity * (1 - (1 - (current_step - warm_up_step) / (cool_down_beginning_step - warm_up_step)) ** 3).
Examples
--------
>>> import nni
>>> from nni.algorithms.compression.v2.pytorch.pruning import MovementPruner
>>> model = ...
>>> # make sure you have used nni.trace to wrap the optimizer class before initialize
>>> traced_optimizer = nni.trace(torch.optim.Adam)(model.parameters())
>>> trainer = ...
>>> criterion = ...
>>> config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'] }]
>>> pruner = MovementPruner(model, config_list, trainer, traced_optimizer, criterion, 10, 3000, 27000)
>>> masked_model, masks = pruner.compress()
For detailed example please refer to :githublink:`examples/model_compress/pruning/v2/movement_pruning_glue.py <examples/model_compress/pruning/v2/movement_pruning_glue.py>`
""" """
def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None], def __init__(self, model: Module, config_list: List[Dict], trainer: Callable[[Module, Optimizer, Callable], None],
traced_optimizer: Traceable, criterion: Callable[[Tensor, Tensor], Tensor], training_epochs: int, warm_up_step: int, traced_optimizer: Traceable, criterion: Callable[[Tensor, Tensor], Tensor], training_epochs: int, warm_up_step: int,
......