NNI's model pruning framework has been upgraded to a more powerful version (named pruning v2 before nni v2.6).
The old version (`pruning before nni v2.6 <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_) will no longer be maintained. If for some reason you have to use the old pruning,
v2.6 is the last nni version that supports it.
.. Use rubric to prevent the section heading from being included in the toc
.. rubric:: Overview
...
...
@@ -147,4 +141,10 @@ The following figure shows how NNI prunes and speeds up your models.
:alt:
The detailed tutorial of Speed Up Model with Mask can be found :doc:`here <../tutorials/pruning_speed_up>`.
The detailed tutorial of Speed Up Model with Calibration Config can be found :doc:`here <../tutorials/quantization_speed_up>`.
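For a quick sense of how pruning and speedup fit together, below is a minimal sketch of the typical flow: wrap the model with a pruner, generate the masks, then speed the model up with those masks. The ``model``, the config list, and the ``1x1x28x28`` dummy input are placeholders (based on the MNIST quickstart model); adapt them to your own model.

.. code-block:: python

    import torch
    from nni.algorithms.compression.v2.pytorch.pruning import L1NormPruner
    from nni.compression.pytorch import ModelSpeedup

    # illustrative config: prune 50% of the output channels of all Conv2d layers
    config_list = [{'sparsity': 0.5, 'op_types': ['Conv2d']}]

    # wrap the model and generate the masks
    pruner = L1NormPruner(model, config_list)
    _, masks = pruner.compress()

    # unwrap the model, then let speedup replace the masked layers with smaller dense ones
    pruner._unwrap_model()
    ModelSpeedup(model, torch.rand(1, 1, 28, 28), masks).speedup_model()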
.. attention::
    NNI's model pruning framework has been upgraded to a more powerful version (named pruning v2 before nni v2.6).
    The old version (`pruning before nni v2.6 <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_) will no longer be maintained. If for some reason you have to use the old pruning,
    v2.6 is the last nni version that supports it.
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\n\nfrom scripts.compression_mnist_model import TorchModel, trainer, evaluator, device\n\n# define the model\nmodel = TorchModel().to(device)\n\n# define the optimizer and criterion for pre-training\n\noptimizer = SGD(model.parameters(), 1e-2)\ncriterion = F.nll_loss\n\n# pre-train and evaluate the model on MNIST dataset\nfor epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\n\nfrom scripts.compression_mnist_model import TorchModel, trainer, evaluator, device\n\n# define the model\nmodel = TorchModel().to(device)\n\n# show the model structure, note that pruner will wrap the model layer.\nprint(model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# define the optimizer and criterion for pre-training\n\noptimizer = SGD(model.parameters(), 1e-2)\ncriterion = F.nll_loss\n\n# pre-train and evaluate the model on MNIST dataset\nfor epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
]
},
{
...
...
@@ -69,7 +80,18 @@
},
"outputs": [],
"source": [
"from nni.algorithms.compression.v2.pytorch.pruning import L1NormPruner\n\npruner = L1NormPruner(model, config_list)\n# show the wrapped model structure\nprint(model)\n# compress the model and generate the masks\n_, masks = pruner.compress()\n# show the masks sparsity\nfor name, mask in masks.items():\n print(name, ' sparsity: ', '{:.2}'.format(mask['weight'].sum() / mask['weight'].numel()))"
"from nni.algorithms.compression.v2.pytorch.pruning import L1NormPruner\npruner = L1NormPruner(model, config_list)\n\n# show the wrapped model structure, `PrunerModuleWrapper` have wrapped the layers that configured in the config_list.\nprint(model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# compress the model and generate the masks\n_, masks = pruner.compress()\n# show the masks sparsity\nfor name, mask in masks.items():\n print(name, ' sparsity : ', '{:.2}'.format(mask['weight'].sum() / mask['weight'].numel()))"
/home/ningshang/nni/nni/compression/pytorch/speedup/infer_mask.py:262: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the gradient for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.
if isinstance(self.output, torch.Tensor) and self.output.grad is not None:
/home/ningshang/nni/nni/compression/pytorch/speedup/compressor.py:282: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the gradient for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.
  if last_output.grad is not None and tin.grad is not None:
aten::log_softmax is not Supported! Please report an issue at https://github.com/microsoft/nni. Thanks~
/home/ningshang/anaconda3/envs/nni-dev/lib/python3.8/site-packages/torch/_tensor.py:1013: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:417.)
return self._grad
.. GENERATED FROM PYTHON SOURCE LINES 90-91
.. GENERATED FROM PYTHON SOURCE LINES 98-99
the model will become really smaller after speed up
.. GENERATED FROM PYTHON SOURCE LINES 91-93
.. GENERATED FROM PYTHON SOURCE LINES 99-101
.. code-block:: default
...
...
@@ -239,14 +283,14 @@ the model will become real smaller after speed up
.. GENERATED FROM PYTHON SOURCE LINES 94-98
.. GENERATED FROM PYTHON SOURCE LINES 102-106
Fine-tuning Compacted Model
---------------------------
Note that if the model has been sped up, you need to re-initialize a new optimizer for fine-tuning,
because speed up replaces the masked large layers with smaller dense ones.
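As a reference, here is a minimal sketch of this step, reusing the ``SGD`` settings and the ``trainer``/``evaluator`` helpers from the pre-training code above (assumed to still be in scope):

.. code-block:: default

    # re-initialize the optimizer on the sped-up model's (new, smaller) parameters
    optimizer = SGD(model.parameters(), 1e-2)

    # fine-tune and evaluate the compacted model
    for epoch in range(3):
        trainer(model, optimizer, criterion)
        evaluator(model)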
.. GENERATED FROM PYTHON SOURCE LINES 98-102
.. GENERATED FROM PYTHON SOURCE LINES 106-110
.. code-block:: default
...
...
@@ -264,7 +308,7 @@ Because speed up will replace the masked big layers with dense small ones.
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) infer module masks...
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for conv1
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::relu.5
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::max_pool2d.6
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for conv2
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::relu.7
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::max_pool2d.8
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::flatten.9
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for fc1
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::relu.10
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for fc2
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::relu.11
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for fc3
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update mask for .aten::log_softmax.12
[2022-02-28 13:29:56] ERROR (nni.compression.pytorch.speedup.jit_translate/MainThread) aten::log_softmax is not Supported! Please report an issue at https://github.com/microsoft/nni. Thanks~
[2022-02-28 13:29:56] WARNING (nni.compression.pytorch.speedup.compressor/MainThread) Note: .aten::log_softmax.12 does not have corresponding mask inference object
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the fc3
aten::log_softmax is not Supported! Please report an issue at https://github.com/microsoft/nni. Thanks~
/home/ningshang/anaconda3/envs/nni-dev/lib/python3.8/site-packages/torch/_tensor.py:1013: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:417.)
return self._grad
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::relu.11
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the fc2
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::relu.10
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the fc1
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::flatten.9
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::max_pool2d.8
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::relu.7
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the conv2
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::max_pool2d.6
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the .aten::relu.5
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the conv1
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) resolve the mask conflict
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace compressed modules...
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace module (name: conv1, op_type: Conv2d)
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::relu.5, op_type: aten::relu) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::max_pool2d.6, op_type: aten::max_pool2d) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace module (name: conv2, op_type: Conv2d)
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::relu.7, op_type: aten::relu) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::max_pool2d.8, op_type: aten::max_pool2d) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::flatten.9, op_type: aten::flatten) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace module (name: fc1, op_type: Linear)
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compress_modules/MainThread) replace linear with new in_features: 256, out_features: 120
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::relu.10, op_type: aten::relu) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace module (name: fc2, op_type: Linear)
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compress_modules/MainThread) replace linear with new in_features: 120, out_features: 84
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::relu.11, op_type: aten::relu) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) replace module (name: fc3, op_type: Linear)
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compress_modules/MainThread) replace linear with new in_features: 84, out_features: 10
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Warning: cannot replace (name: .aten::log_softmax.12, op_type: aten::log_softmax) which is func type
[2022-02-28 13:29:56] INFO (nni.compression.pytorch.speedup.compressor/MainThread) speedup done