Unverified commit fe02b808 authored by J-shang, committed by GitHub

[Doc] split index to overview & toctree compression part (#4749)

parent b8d029b1
.. note::
:class: sphx-glr-download-link-note
@@ -112,7 +112,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"For combining usage of ``Pruner`` masks generation with ``ModelSpeedup``,\nplease refer to `Pruning Quick Start <./pruning_quick_start_mnist.html>`__.\n\nNOTE: The current implementation supports PyTorch 1.3.1 or newer.\n\n## Limitations\n\nFor PyTorch we can only replace modules, if functions in ``forward`` should be replaced,\nour current implementation does not work. One workaround is make the function a PyTorch module.\n\nIf you want to speedup your own model which cannot supported by the current implementation,\nyou need implement the replace function for module replacement, welcome to contribute.\n\n## Speedup Results of Examples\n\nThe code of these experiments can be found :githublink:`here <examples/model_compress/pruning/speedup/model_speedup.py>`.\n\nThese result are tested on the `legacy pruning framework <../comporession/pruning_legacy>`__, new results will coming soon.\n\n### slim pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01197\n - 0.005107\n * - 2\n - 0.02019\n - 0.008769\n * - 4\n - 0.02733\n - 0.014809\n * - 8\n - 0.04310\n - 0.027441\n * - 16\n - 0.07731\n - 0.05008\n * - 32\n - 0.14464\n - 0.10027\n\n### fpgm pruner example\n\non cpu,\ninput tensor: ``torch.randn(64, 1, 28, 28)``\\ ,\ntoo large variance\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01383\n - 0.01839\n * - 2\n - 0.01167\n - 0.003558\n * - 4\n - 0.01636\n - 0.01088\n * - 40\n - 0.14412\n - 0.08268\n * - 40\n - 1.29385\n - 0.14408\n * - 40\n - 0.41035\n - 0.46162\n * - 400\n - 6.29020\n - 5.82143\n\n### l1filter pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01026\n - 0.003677\n * - 2\n - 0.01657\n - 0.008161\n * - 4\n - 0.02458\n - 0.020018\n * - 8\n - 0.03498\n - 0.025504\n * - 16\n - 0.06757\n - 0.047523\n * - 32\n - 0.10487\n - 0.086442\n\n### APoZ pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01389\n - 0.004208\n * - 2\n - 0.01628\n - 0.008310\n * - 4\n - 0.02521\n - 0.014008\n * - 8\n - 0.03386\n - 0.023923\n * - 16\n - 0.06042\n - 0.046183\n * - 32\n - 0.12421\n - 0.087113\n\n### SimulatedAnnealing pruner example\n\nIn this experiment, we use SimulatedAnnealing pruner to prune the resnet18 on the cifar10 dataset.\nWe measure the latencies and accuracies of the pruned model under different sparsity ratios, as shown in the following figure.\nThe latency is measured on one V100 GPU and the input tensor is ``torch.randn(128, 3, 32, 32)``.\n\n<img src=\"file://../../img/SA_latency_accuracy.png\">\n\n"
"For combining usage of ``Pruner`` masks generation with ``ModelSpeedup``,\nplease refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.\n\nNOTE: The current implementation supports PyTorch 1.3.1 or newer.\n\n## Limitations\n\nFor PyTorch we can only replace modules, if functions in ``forward`` should be replaced,\nour current implementation does not work. One workaround is make the function a PyTorch module.\n\nIf you want to speedup your own model which cannot supported by the current implementation,\nyou need implement the replace function for module replacement, welcome to contribute.\n\n## Speedup Results of Examples\n\nThe code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.\n\nThese result are tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_, new results will coming soon.\n\n### slim pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01197\n - 0.005107\n * - 2\n - 0.02019\n - 0.008769\n * - 4\n - 0.02733\n - 0.014809\n * - 8\n - 0.04310\n - 0.027441\n * - 16\n - 0.07731\n - 0.05008\n * - 32\n - 0.14464\n - 0.10027\n\n### fpgm pruner example\n\non cpu,\ninput tensor: ``torch.randn(64, 1, 28, 28)``\\ ,\ntoo large variance\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01383\n - 0.01839\n * - 2\n - 0.01167\n - 0.003558\n * - 4\n - 0.01636\n - 0.01088\n * - 40\n - 0.14412\n - 0.08268\n * - 40\n - 1.29385\n - 0.14408\n * - 40\n - 0.41035\n - 0.46162\n * - 400\n - 6.29020\n - 5.82143\n\n### l1filter pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01026\n - 0.003677\n * - 2\n - 0.01657\n - 0.008161\n * - 4\n - 0.02458\n - 0.020018\n * - 8\n - 0.03498\n - 0.025504\n * - 16\n - 0.06757\n - 0.047523\n * - 32\n - 0.10487\n - 0.086442\n\n### APoZ pruner example\n\non one V100 GPU,\ninput tensor: ``torch.randn(64, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - Times\n - Mask Latency\n - Speedup Latency\n * - 1\n - 0.01389\n - 0.004208\n * - 2\n - 0.01628\n - 0.008310\n * - 4\n - 0.02521\n - 0.014008\n * - 8\n - 0.03386\n - 0.023923\n * - 16\n - 0.06042\n - 0.046183\n * - 32\n - 0.12421\n - 0.087113\n\n### SimulatedAnnealing pruner example\n\nIn this experiment, we use SimulatedAnnealing pruner to prune the resnet18 on the cifar10 dataset.\nWe measure the latencies and accuracies of the pruned model under different sparsity ratios, as shown in the following figure.\nThe latency is measured on one V100 GPU and the input tensor is ``torch.randn(128, 3, 32, 32)``.\n\n<img src=\"file://../../img/SA_latency_accuracy.png\">\n\n"
]
}
],
@@ -132,7 +132,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.9.7"
}
},
"nbformat": 4,
@@ -77,7 +77,7 @@ print('Speedup Model - Elapsed Time : ', time.time() - start)
# %%
# For the combined usage of ``Pruner`` mask generation with ``ModelSpeedup``,
# please refer to `Pruning Quick Start <./pruning_quick_start_mnist.html>`__.
# please refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.
#
# NOTE: The current implementation supports PyTorch 1.3.1 or newer.
#
@@ -93,9 +93,9 @@ print('Speedup Model - Elapsed Time : ', time.time() - start)
# Speedup Results of Examples
# ---------------------------
#
# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/speedup/model_speedup.py>`.
# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.
#
# These results were tested on the `legacy pruning framework <../comporession/pruning_legacy>`__; new results are coming soon.
# These results were tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_; new results are coming soon.
#
# slim pruner example
# ^^^^^^^^^^^^^^^^^^^
@@ -108,6 +108,12 @@ Show the original model structure.
(fc1): Linear(in_features=256, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): ReLU()
(relu2): ReLU()
(relu3): ReLU()
(relu4): ReLU()
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
@@ -136,7 +142,7 @@ Roughly test the original model inference speed.
.. code-block:: none
Original Model - Elapsed Time : 0.13896703720092773
Original Model - Elapsed Time : 0.5094916820526123
@@ -165,7 +171,7 @@ Speedup the model and show the model structure after speedup.
aten::log_softmax is not Supported! Please report an issue at https://github.com/microsoft/nni. Thanks~
Note: .aten::log_softmax.12 does not have corresponding mask inference object
/home/ningshang/anaconda3/envs/nni-dev/lib/python3.8/site-packages/torch/_tensor.py:1013: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:417.)
/home/nishang/anaconda3/envs/MCM/lib/python3.9/site-packages/torch/_tensor.py:1013: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at /opt/conda/conda-bld/pytorch_1640811803361/work/build/aten/src/ATen/core/TensorBody.h:417.)
return self._grad
TorchModel(
(conv1): Conv2d(1, 3, kernel_size=(5, 5), stride=(1, 1))
@@ -173,6 +179,12 @@ Speedup the model and show the model structure after speedup.
(fc1): Linear(in_features=256, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): ReLU()
(relu2): ReLU()
(relu3): ReLU()
(relu4): ReLU()
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
@@ -200,7 +212,7 @@ Roughly test the model after speedup inference speed.
.. code-block:: none
Speedup Model - Elapsed Time : 0.003123760223388672
Speedup Model - Elapsed Time : 0.006000041961669922
@@ -208,7 +220,7 @@ Roughly test the model after speedup inference speed.
.. GENERATED FROM PYTHON SOURCE LINES 79-240
For the combined usage of ``Pruner`` mask generation with ``ModelSpeedup``,
please refer to `Pruning Quick Start <./pruning_quick_start_mnist.html>`__.
please refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.
NOTE: The current implementation supports PyTorch 1.3.1 or newer.
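As the Limitations note in this tutorial explains, ModelSpeedup can only replace modules; a function called inside ``forward`` must first be wrapped in its own module. A minimal sketch of that workaround (the ``Flatten`` wrapper and ``Net`` are illustrative, not part of the tutorial):

.. code-block:: default

    import torch
    import torch.nn as nn

    class Flatten(nn.Module):
        """Wraps torch.flatten so that speedup sees a replaceable module."""
        def forward(self, x):
            return torch.flatten(x, 1)

    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2d(1, 6, kernel_size=5)
            self.flatten = Flatten()  # a module instead of a bare function call
            self.fc = nn.Linear(6 * 24 * 24, 10)

        def forward(self, x):
            x = self.conv(x)
            x = self.flatten(x)  # replaceable; a bare torch.flatten(x, 1) would not be
            return self.fc(x)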
@@ -224,9 +236,9 @@ you need implement the replace function for module replacement, welcome to contr
Speedup Results of Examples
---------------------------
The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/speedup/model_speedup.py>`.
The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.
These results were tested on the `legacy pruning framework <../comporession/pruning_legacy>`__; new results are coming soon.
These results were tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_; new results are coming soon.
slim pruner example
^^^^^^^^^^^^^^^^^^^
@@ -372,7 +384,7 @@ The latency is measured on one V100 GPU and the input tensor is ``torch.randn(1
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 0 minutes 12.486 seconds)
**Total running time of the script:** ( 0 minutes 4.528 seconds)
.. _sphx_glr_download_tutorials_pruning_speedup.py:
@@ -33,7 +33,7 @@
},
"outputs": [],
"source": [
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\n\nfrom scripts.compression_mnist_model import TorchModel, trainer, evaluator, device\n\n# define the model\nmodel = TorchModel().to(device)\n\n# define the optimizer and criterion for pre-training\n\noptimizer = SGD(model.parameters(), 1e-2)\ncriterion = F.nll_loss\n\n# pre-train and evaluate the model on MNIST dataset\nfor epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\n\nfrom scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt\n\n# define the model\nmodel = TorchModel().to(device)\n\n# define the optimizer and criterion for pre-training\n\noptimizer = SGD(model.parameters(), 1e-2)\ncriterion = F.nll_loss\n\n# pre-train and evaluate the model on MNIST dataset\nfor epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
"config_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['conv1']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_names': ['relu1']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['conv2']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_names': ['relu2']\n}]"
"config_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_types': ['Conv2d']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_types': ['ReLU']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['fc1', 'fc2']\n}]"
]
},
{
@@ -107,6 +107,24 @@
"source": [
"model_path = \"./log/mnist_model.pth\"\ncalibration_path = \"./log/mnist_calibration.pth\"\ncalibration_config = quantizer.export_model(model_path, calibration_path)\n\nprint(\"calibration_config: \", calibration_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"build tensorRT engine to make a real speedup, for more information about speedup, please refer :doc:`quantization_speedup`.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT\ninput_shape = (32, 1, 28, 28)\nengine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)\nengine.compress()\ntest_trt(engine)"
]
}
],
"metadata": {
@@ -125,7 +143,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.9.7"
}
},
"nbformat": 4,
@@ -19,7 +19,7 @@ import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
@@ -44,19 +44,15 @@ for epoch in range(3):
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv1']
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu1']
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
'op_names': ['fc1', 'fc2']
}]
# %%
@@ -82,3 +78,12 @@ calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# Build a TensorRT engine to get a real speedup. For more information about speedup, please refer to :doc:`quantization_speedup`.
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
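For reference, a hedged sketch of what ``test_trt`` could do with the built engine, assuming ``ModelSpeedupTensorRT`` exposes an ``inference(data)`` method that returns the output and the elapsed time, as in the NNI examples:

.. code-block:: default

    # hypothetical usage; engine.inference is assumed from the NNI examples
    import torch

    data = torch.rand(32, 1, 28, 28)
    output, latency = engine.inference(data)
    print('TensorRT output shape:', output.shape, '- latency (s):', latency)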
@@ -43,7 +43,7 @@ If you are familiar with defining a model and training in pytorch, you can skip
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
@@ -68,9 +68,9 @@ If you are familiar with defining a model and training in pytorch, you can skip
.. code-block:: none
Average test loss: 0.4043, Accuracy: 8879/10000 (89%)
Average test loss: 0.2668, Accuracy: 9212/10000 (92%)
Average test loss: 0.1599, Accuracy: 9510/10000 (95%)
Average test loss: 0.7073, Accuracy: 7624/10000 (76%)
Average test loss: 0.2776, Accuracy: 9122/10000 (91%)
Average test loss: 0.1907, Accuracy: 9412/10000 (94%)
@@ -83,7 +83,7 @@ Quantizing Model
Initialize a `config_list`.
For details about how to write ``config_list``, please refer to :doc:`compression config specification <../compression/compression_config_list>`.
.. GENERATED FROM PYTHON SOURCE LINES 43-62
.. GENERATED FROM PYTHON SOURCE LINES 43-58
.. code-block:: default
@@ -91,19 +91,15 @@ Detailed about how to write ``config_list`` please refer :doc:`compression confi
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv1']
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu1']
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
'op_names': ['fc1', 'fc2']
}]
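To paraphrase the selector semantics: ``op_types`` matches layers by module class (every ``Conv2d``, every ``ReLU``), while ``op_names`` matches specific attributes (``fc1``, ``fc2``). A rough, illustrative sketch of the assumed matching rule (not NNI's actual implementation):

.. code-block:: default

    # illustrative only: approximates how a config entry selects layers
    def matches(entry, name, module):
        by_type = type(module).__name__ in entry.get('op_types', [])
        by_name = name in entry.get('op_names', [])
        return by_type or by_name

    for name, module in model.named_modules():
        if any(matches(entry, name, module) for entry in config_list):
            print(name, 'will be wrapped by the quantizer')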
@@ -113,11 +109,11 @@ Detailed about how to write ``config_list`` please refer :doc:`compression confi
.. GENERATED FROM PYTHON SOURCE LINES 63-64
.. GENERATED FROM PYTHON SOURCE LINES 59-60
fine-tuning the model using QAT
.. GENERATED FROM PYTHON SOURCE LINES 64-69
.. GENERATED FROM PYTHON SOURCE LINES 60-65
.. code-block:: default
@@ -136,8 +132,6 @@ finetuning the model by using QAT
.. code-block:: none
op_names ['relu1'] not found in model
op_names ['relu2'] not found in model
TorchModel(
(conv1): QuantizerModuleWrapper(
@@ -146,20 +140,38 @@
(conv2): QuantizerModuleWrapper(
(module): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
)
(fc1): Linear(in_features=256, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc1): QuantizerModuleWrapper(
(module): Linear(in_features=256, out_features=120, bias=True)
)
(fc2): QuantizerModuleWrapper(
(module): Linear(in_features=120, out_features=84, bias=True)
)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): QuantizerModuleWrapper(
(module): ReLU()
)
(relu2): QuantizerModuleWrapper(
(module): ReLU()
)
(relu3): QuantizerModuleWrapper(
(module): ReLU()
)
(relu4): QuantizerModuleWrapper(
(module): ReLU()
)
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
.. GENERATED FROM PYTHON SOURCE LINES 70-73
.. GENERATED FROM PYTHON SOURCE LINES 66-69
The model has now been wrapped, and quantization targets ('quant_types' setting in `config_list`)
will be quantized & dequantized for simulated quantization in the wrapped layers.
QAT is a training-aware quantizer; it will update scale and zero point during training.
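In other words, each wrapped layer inserts a quantize/dequantize pair around its targets. A minimal sketch of the underlying affine fake-quantization step (the standard formulation; NNI's exact implementation may differ):

.. code-block:: default

    import torch

    def fake_quantize(x, scale, zero_point, bits=8):
        # round onto the integer grid, clamp to the representable range,
        # then map back to float so training continues in full precision
        qmin, qmax = 0, 2 ** bits - 1
        q = torch.clamp(torch.round(x / scale + zero_point), qmin, qmax)
        return (q - zero_point) * scale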
.. GENERATED FROM PYTHON SOURCE LINES 73-78
.. GENERATED FROM PYTHON SOURCE LINES 69-74
.. code-block:: default
@@ -178,18 +190,18 @@ QAT is a training-aware quantizer, it will update scale and zero point during tr
.. code-block:: none
Average test loss: 0.1332, Accuracy: 9601/10000 (96%)
Average test loss: 0.1180, Accuracy: 9657/10000 (97%)
Average test loss: 0.0894, Accuracy: 9714/10000 (97%)
Average test loss: 0.1542, Accuracy: 9529/10000 (95%)
Average test loss: 0.1133, Accuracy: 9664/10000 (97%)
Average test loss: 0.0919, Accuracy: 9726/10000 (97%)
.. GENERATED FROM PYTHON SOURCE LINES 79-80
.. GENERATED FROM PYTHON SOURCE LINES 75-76
export model and get calibration_config
.. GENERATED FROM PYTHON SOURCE LINES 80-85
.. GENERATED FROM PYTHON SOURCE LINES 76-82
.. code-block:: default
@@ -202,13 +214,44 @@ export model and get calibration_config
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0031], device='cuda:0'), 'weight_zero_point': tensor([76.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0018], device='cuda:0'), 'weight_zero_point': tensor([113.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 12.42452621459961}, 'fc1': {'weight_bits': 8, 'weight_scale': tensor([0.0011], device='cuda:0'), 'weight_zero_point': tensor([124.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 31.650196075439453}, 'fc2': {'weight_bits': 8, 'weight_scale': tensor([0.0013], device='cuda:0'), 'weight_zero_point': tensor([122.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 25.805370330810547}, 'relu1': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 12.499907493591309}, 'relu2': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 32.0243034362793}, 'relu3': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 26.491384506225586}, 'relu4': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 17.662996292114258}}
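Each entry above pairs a ``weight_scale``/``weight_zero_point`` with tracked input ranges. Under the usual affine convention (an assumption, not stated by the tutorial), the float range a layer can represent is recoverable from them; for the ``conv1`` entry:

.. code-block:: default

    # assumed convention: real = (q - zero_point) * scale, q in [0, 2**bits - 1]
    scale, zero_point, bits = 0.0031, 76.0, 8
    qmin, qmax = 0, 2 ** bits - 1
    print('representable range: %.4f to %.4f'
          % ((qmin - zero_point) * scale, (qmax - zero_point) * scale))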
.. GENERATED FROM PYTHON SOURCE LINES 83-84
Build a TensorRT engine to get a real speedup. For more information about speedup, please refer to :doc:`quantization_speedup`.
.. GENERATED FROM PYTHON SOURCE LINES 84-90
.. code-block:: default
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0040], device='cuda:0'), 'weight_zero_point': tensor([84.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0017], device='cuda:0'), 'weight_zero_point': tensor([111.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 18.413312911987305}}
Loss: 0.09358334274291992 Accuracy: 97.21%
Inference elapsed_time (whole dataset): 0.04445981979370117s
@@ -216,7 +259,7 @@ export model and get calibration_config
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 1 minutes 46.015 seconds)
**Total running time of the script:** ( 1 minutes 36.499 seconds)
.. _sphx_glr_download_tutorials_quantization_quick_start_mnist.py:
......
@@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\nfrom scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt\n\nconfig_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['conv1']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_names': ['relu1']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['conv2']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_names': ['relu2']\n}]\n\nmodel = TorchModel().to(device)\noptimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)\ncriterion = F.nll_loss\ndummy_input = torch.rand(32, 1, 28,28).to(device)\n\nfrom nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\nquantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)\nquantizer.compress()"
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\nfrom scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt\n\nconfig_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_types': ['Conv2d']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_types': ['ReLU']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['fc1', 'fc2']\n}]\n\nmodel = TorchModel().to(device)\noptimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)\ncriterion = F.nll_loss\ndummy_input = torch.rand(32, 1, 28, 28).to(device)\n\nfrom nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\nquantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)\nquantizer.compress()"
]
},
{
@@ -62,7 +62,7 @@
},
"outputs": [],
"source": [
"model_path = \"./log/mnist_model.pth\"\ncalibration_path = \"./log/mnist_calibration.pth\"\ncalibration_config = quantizer.export_model(model_path, calibration_path)\n\nprint(\"calibration_config: \", calibration_config)"
"import os\nos.makedirs('log', exist_ok=True)\nmodel_path = \"./log/mnist_model.pth\"\ncalibration_path = \"./log/mnist_calibration.pth\"\ncalibration_config = quantizer.export_model(model_path, calibration_path)\n\nprint(\"calibration_config: \", calibration_config)"
]
},
{
@@ -80,7 +80,7 @@
},
"outputs": [],
"source": [
"# from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT\n# input_shape = (32, 1, 28, 28)\n# engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)\n# engine.compress()\n# test_trt(engine)"
"from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT\ninput_shape = (32, 1, 28, 28)\nengine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)\nengine.compress()\ntest_trt(engine)"
]
},
{
@@ -107,7 +107,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.9.7"
}
},
"nbformat": 4,
@@ -69,25 +69,21 @@ from scripts.compression_mnist_model import TorchModel, device, trainer, evaluat
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv1']
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu1']
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
'op_names': ['fc1', 'fc2']
}]
model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28,28).to(device)
dummy_input = torch.rand(32, 1, 28, 28).to(device)
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
@@ -101,6 +97,8 @@ for epoch in range(3):
# %%
# export model and get calibration_config
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
@@ -110,11 +108,11 @@ print("calibration_config: ", calibration_config)
# %%
# Build a TensorRT engine to get a real speedup
# from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
# input_shape = (32, 1, 28, 28)
# engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
# engine.compress()
# test_trt(engine)
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
# %%
# Note that NNI also supports post-training quantization directly; please refer to the complete examples for details.
@@ -77,7 +77,7 @@ Note
Usage
-----
.. GENERATED FROM PYTHON SOURCE LINES 64-96
.. GENERATED FROM PYTHON SOURCE LINES 64-92
.. code-block:: default
@@ -89,25 +89,21 @@ Usage
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv1']
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu1']
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
'op_names': ['fc1', 'fc2']
}]
model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28,28).to(device)
dummy_input = torch.rand(32, 1, 28, 28).to(device)
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
@@ -123,8 +119,6 @@ Usage
.. code-block:: none
op_names ['relu1'] not found in model
op_names ['relu2'] not found in model
TorchModel(
(conv1): QuantizerModuleWrapper(
@@ -133,18 +127,36 @@
(conv2): QuantizerModuleWrapper(
(module): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
)
(fc1): Linear(in_features=256, out_features=120, bias=True)
(fc2): Linear(in_features=120, out_features=84, bias=True)
(fc1): QuantizerModuleWrapper(
(module): Linear(in_features=256, out_features=120, bias=True)
)
(fc2): QuantizerModuleWrapper(
(module): Linear(in_features=120, out_features=84, bias=True)
)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): QuantizerModuleWrapper(
(module): ReLU()
)
(relu2): QuantizerModuleWrapper(
(module): ReLU()
)
(relu3): QuantizerModuleWrapper(
(module): ReLU()
)
(relu4): QuantizerModuleWrapper(
(module): ReLU()
)
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
.. GENERATED FROM PYTHON SOURCE LINES 97-98
.. GENERATED FROM PYTHON SOURCE LINES 93-94
fine-tuning the model using QAT
.. GENERATED FROM PYTHON SOURCE LINES 98-102
.. GENERATED FROM PYTHON SOURCE LINES 94-98
.. code-block:: default
@@ -162,21 +174,23 @@ finetuning the model by using QAT
.. code-block:: none
Average test loss: 0.3100, Accuracy: 9056/10000 (91%)
Average test loss: 0.1559, Accuracy: 9558/10000 (96%)
Average test loss: 0.1031, Accuracy: 9690/10000 (97%)
Average test loss: 0.3444, Accuracy: 9141/10000 (91%)
Average test loss: 0.1325, Accuracy: 9599/10000 (96%)
Average test loss: 0.0980, Accuracy: 9700/10000 (97%)
.. GENERATED FROM PYTHON SOURCE LINES 103-104
.. GENERATED FROM PYTHON SOURCE LINES 99-100
export model and get calibration_config
.. GENERATED FROM PYTHON SOURCE LINES 104-110
.. GENERATED FROM PYTHON SOURCE LINES 100-108
.. code-block:: default
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
@@ -193,34 +207,43 @@ export model and get calibration_config
.. code-block:: none
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0031], device='cuda:0'), 'weight_zero_point': tensor([103.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0018], device='cuda:0'), 'weight_zero_point': tensor([111.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 10.046737670898438}}
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0029], device='cuda:0'), 'weight_zero_point': tensor([121.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0015], device='cuda:0'), 'weight_zero_point': tensor([109.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 7.498777389526367}, 'fc1': {'weight_bits': 8, 'weight_scale': tensor([0.0009], device='cuda:0'), 'weight_zero_point': tensor([125.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 13.905810356140137}, 'fc2': {'weight_bits': 8, 'weight_scale': tensor([0.0012], device='cuda:0'), 'weight_zero_point': tensor([118.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 12.378301620483398}, 'relu1': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 7.626255035400391}, 'relu2': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 14.335213661193848}, 'relu3': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 12.815309524536133}, 'relu4': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 11.077027320861816}}
.. GENERATED FROM PYTHON SOURCE LINES 111-112
.. GENERATED FROM PYTHON SOURCE LINES 109-110
Build a TensorRT engine to get a real speedup
.. GENERATED FROM PYTHON SOURCE LINES 112-119
.. GENERATED FROM PYTHON SOURCE LINES 110-117
.. code-block:: default
# from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
# input_shape = (32, 1, 28, 28)
# engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
# engine.compress()
# test_trt(engine)
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Loss: 0.09857580718994141 Accuracy: 96.96%
Inference elapsed_time (whole dataset): 0.044492483139038086s
.. GENERATED FROM PYTHON SOURCE LINES 120-171
.. GENERATED FROM PYTHON SOURCE LINES 118-169
Note that NNI also supports post-training quantization directly; please refer to the complete examples for details.
@@ -277,7 +300,7 @@ input tensor: ``torch.randn(128, 3, 32, 32)``
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 0 minutes 55.231 seconds)
**Total running time of the script:** ( 0 minutes 59.208 seconds)
.. _sphx_glr_download_tutorials_quantization_speedup.py:
data/
log/
*.onnx
\ No newline at end of file
@@ -89,7 +89,7 @@ for name, mask in masks.items():
# need to unwrap the model, if the model is wrapped before speedup
pruner._unwrap_model()
# speedup the model
# speedup the model; for more information about speedup, please refer to :doc:`pruning_speedup`.
from nni.compression.pytorch.speedup import ModelSpeedup
ModelSpeedup(model, torch.rand(3, 1, 28, 28).to(device), masks).speedup_model()
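To see the effect, inference time can be measured the same way the tutorial does before and after ``speedup_model()``; a minimal sketch:

.. code-block:: default

    import time

    start = time.time()
    model(torch.rand(3, 1, 28, 28).to(device))
    print('Speedup Model - Elapsed Time : ', time.time() - start)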
@@ -77,7 +77,7 @@ print('Speedup Model - Elapsed Time : ', time.time() - start)
# %%
# For the combined usage of ``Pruner`` mask generation with ``ModelSpeedup``,
# please refer to `Pruning Quick Start <./pruning_quick_start_mnist.html>`__.
# please refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.
#
# NOTE: The current implementation supports PyTorch 1.3.1 or newer.
#
@@ -93,9 +93,9 @@ print('Speedup Model - Elapsed Time : ', time.time() - start)
# Speedup Results of Examples
# ---------------------------
#
# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/speedup/model_speedup.py>`.
# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.
#
# These results were tested on the `legacy pruning framework <../comporession/pruning_legacy>`__; new results are coming soon.
# These results were tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_; new results are coming soon.
#
# slim pruner example
# ^^^^^^^^^^^^^^^^^^^
@@ -19,7 +19,7 @@ import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
@@ -44,19 +44,15 @@ for epoch in range(3):
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv1']
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu1']
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_names': ['relu2']
'op_names': ['fc1', 'fc2']
}]
# %%
@@ -82,3 +78,12 @@ calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# Build a TensorRT engine to get a real speedup. For more information about speedup, please refer to :doc:`quantization_speedup`.
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)