"docs/en/notes/changelog_v1.0.x.md" did not exist on "672cf58df6379005ba5cb0d9b0d188dca1f1fe86"
Unverified Commit a911b856 authored by Yuge Zhang, committed by GitHub

Resolve conflicts for #4760 (#4762)

parent 14d2966b
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "tutorials/quantization_customize.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_tutorials_quantization_customize.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_tutorials_quantization_customize.py:
Customize a new quantization algorithm
======================================
To write a new quantization algorithm, create a class that inherits from ``nni.compression.pytorch.Quantizer`` and override its member functions with the logic of your algorithm.
The key member function to override is ``quantize_weight``.
``quantize_weight`` returns the quantized weights directly rather than a mask, because for quantization the quantized weights cannot be obtained by applying a mask.
.. GENERATED FROM PYTHON SOURCE LINES 9-80
.. code-block:: default
from nni.compression.pytorch import Quantizer

class YourQuantizer(Quantizer):
    def __init__(self, model, config_list):
        """
        It is suggested to use the NNI-defined config specification.
        """
        super().__init__(model, config_list)

    def quantize_weight(self, weight, config, **kwargs):
        """
        Quantizers should override this method to quantize weight tensors.
        This method is effectively hooked to :meth:`forward` of the model.

        Parameters
        ----------
        weight : Tensor
            weight that needs to be quantized
        config : dict
            the configuration for weight quantization
        """
        # Put your code to generate `new_weight` here
        new_weight = ...
        return new_weight

    def quantize_output(self, output, config, **kwargs):
        """
        Quantizers should override this method to quantize output.
        This method is effectively hooked to :meth:`forward` of the model.

        Parameters
        ----------
        output : Tensor
            output that needs to be quantized
        config : dict
            the configuration for output quantization
        """
        # Put your code to generate `new_output` here
        new_output = ...
        return new_output

    def quantize_input(self, *inputs, config, **kwargs):
        """
        Quantizers should override this method to quantize input.
        This method is effectively hooked to :meth:`forward` of the model.

        Parameters
        ----------
        inputs : Tensor
            inputs that need to be quantized
        config : dict
            the configuration for input quantization
        """
        # Put your code to generate `new_input` here
        new_input = ...
        return new_input

    def update_epoch(self, epoch_num):
        pass

    def step(self):
        """
        Can do some processing based on the model or weights bound
        in the ``bind_model`` function.
        """
        pass
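As an illustration only (this is not one of NNI's built-in algorithms), a naive ``quantize_weight`` method, placed inside your ``Quantizer`` subclass, that simulates 8-bit min-max quantization could look like this:

.. code-block:: python

    import torch

    def quantize_weight(self, weight, config, **kwargs):
        # hypothetical sketch: naive min-max "fake quantization"
        bits = 8  # a real implementation would read the bit width from `config`
        qmax = 2 ** bits - 1
        w_min, w_max = weight.min(), weight.max()
        scale = (w_max - w_min).clamp(min=1e-8) / qmax
        # quantize to integers, then dequantize back to float
        q = torch.clamp(torch.round((weight - w_min) / scale), 0, qmax)
        # return the (de)quantized weights, as required by ``quantize_weight``
        return q * scale + w_min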
.. GENERATED FROM PYTHON SOURCE LINES 81-87
Customize backward function
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Sometimes a quantization operation needs a customized backward function,
such as the `Straight-Through Estimator <https://stackoverflow.com/questions/38361314/the-concept-of-straight-through-estimator-ste>`__\ .
You can customize a backward function as follows:
.. GENERATED FROM PYTHON SOURCE LINES 87-122
.. code-block:: default
from nni.compression.pytorch.compressor import Quantizer, QuantGrad, QuantType

class ClipGrad(QuantGrad):
    @staticmethod
    def quant_backward(tensor, grad_output, quant_type):
        """
        This method should be overridden by subclasses to provide a customized backward function;
        the default implementation is the Straight-Through Estimator.

        Parameters
        ----------
        tensor : Tensor
            input of the quantization operation
        grad_output : Tensor
            gradient of the output of the quantization operation
        quant_type : QuantType
            the type of quantization; it can be `QuantType.INPUT`, `QuantType.WEIGHT`, or `QuantType.OUTPUT`,
            and you can define different behavior for different types.

        Returns
        -------
        tensor
            gradient of the input of the quantization operation
        """
        # for output quantization, set the gradient to zero where the absolute value of the tensor is larger than 1
        if quant_type == QuantType.OUTPUT:
            grad_output[tensor.abs() > 1] = 0
        return grad_output

class _YourQuantizer(Quantizer):
    def __init__(self, model, config_list):
        super().__init__(model, config_list)
        # set your customized backward function to overwrite the default backward function
        self.quant_grad = ClipGrad
.. GENERATED FROM PYTHON SOURCE LINES 123-124
If you do not customize ``QuantGrad``, the default backward is the Straight-Through Estimator.
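Conceptually, the Straight-Through Estimator treats the non-differentiable quantization operation as an identity when propagating gradients. A minimal sketch of such a backward (NNI's actual default may differ in details) is:

.. code-block:: python

    from nni.compression.pytorch.compressor import QuantGrad

    class StraightThroughGrad(QuantGrad):
        @staticmethod
        def quant_backward(tensor, grad_output, quant_type):
            # pass the output gradient straight through, ignoring the
            # non-differentiable quantization applied in the forward pass
            return grad_output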
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 0 minutes 1.269 seconds)
.. _sphx_glr_download_tutorials_quantization_customize.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: quantization_customize.py <quantization_customize.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: quantization_customize.ipynb <quantization_customize.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Quantization Quickstart\n\nQuantization reduces model size and speeds up inference time by reducing the number of bits required to represent weights or activations.\n\nIn NNI, both post-training quantization algorithms and quantization-aware training algorithms are supported.\nHere we use `QAT_Quantizer` as an example to show the usage of quantization in NNI.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation\n\nIn this tutorial, we use a simple model and pre-train on MNIST dataset.\nIf you are familiar with defining a model and training in pytorch, you can skip directly to `Quantizing Model`_.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\n\nfrom scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt\n\n# define the model\nmodel = TorchModel().to(device)\n\n# define the optimizer and criterion for pre-training\n\noptimizer = SGD(model.parameters(), 1e-2)\ncriterion = F.nll_loss\n\n# pre-train and evaluate the model on MNIST dataset\nfor epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quantizing Model\n\nInitialize a `config_list`.\nDetailed about how to write ``config_list`` please refer :doc:`compression config specification <../compression/compression_config_list>`.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"config_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_types': ['Conv2d']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_types': ['ReLU']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['fc1', 'fc2']\n}]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"finetuning the model by using QAT\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\ndummy_input = torch.rand(32, 1, 28, 28).to(device)\nquantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)\nquantizer.compress()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model has now been wrapped, and quantization targets ('quant_types' setting in `config_list`)\nwill be quantized & dequantized for simulated quantization in the wrapped layers.\nQAT is a training-aware quantizer, it will update scale and zero point during training.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"export model and get calibration_config\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model_path = \"./log/mnist_model.pth\"\ncalibration_path = \"./log/mnist_calibration.pth\"\ncalibration_config = quantizer.export_model(model_path, calibration_path)\n\nprint(\"calibration_config: \", calibration_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"build tensorRT engine to make a real speedup, for more information about speedup, please refer :doc:`quantization_speedup`.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT\ninput_shape = (32, 1, 28, 28)\nengine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)\nengine.compress()\ntest_trt(engine)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
"""
Quantization Quickstart
=======================
Quantization reduces model size and speeds up inference time by reducing the number of bits required to represent weights or activations.
In NNI, both post-training quantization algorithms and quantization-aware training algorithms are supported.
Here we use `QAT_Quantizer` as an example to show the usage of quantization in NNI.
"""
# %%
# Preparation
# -----------
#
# In this tutorial, we use a simple model and pre-train it on the MNIST dataset.
# If you are familiar with defining a model and training in PyTorch, you can skip directly to `Quantizing Model`_.
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
# define the optimizer and criterion for pre-training
optimizer = SGD(model.parameters(), 1e-2)
criterion = F.nll_loss
# pre-train and evaluate the model on MNIST dataset
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# Quantizing Model
# ----------------
#
# Initialize a `config_list`.
# For details on how to write a ``config_list``, please refer to the :doc:`compression config specification <../compression/compression_config_list>`.
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
# %%
# finetuning the model by using QAT
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
dummy_input = torch.rand(32, 1, 28, 28).to(device)
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
# %%
# The model has now been wrapped, and quantization targets (the 'quant_types' setting in `config_list`)
# will be quantized & dequantized for simulated quantization in the wrapped layers.
# QAT is a training-aware quantizer; it updates the scale and zero point during training.
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# export model and get calibration_config
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# Build the TensorRT engine to get a real speedup. For more information about speedup, please refer to :doc:`quantization_speedup`.
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
bceaf8235b437428267b614af06634a0
\ No newline at end of file
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "tutorials/quantization_quick_start_mnist.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_tutorials_quantization_quick_start_mnist.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_tutorials_quantization_quick_start_mnist.py:
Quantization Quickstart
=======================
Quantization reduces model size and speeds up inference time by reducing the number of bits required to represent weights or activations.
In NNI, both post-training quantization algorithms and quantization-aware training algorithms are supported.
Here we use `QAT_Quantizer` as an example to show the usage of quantization in NNI.
.. GENERATED FROM PYTHON SOURCE LINES 12-17
Preparation
-----------
In this tutorial, we use a simple model and pre-train it on the MNIST dataset.
If you are familiar with defining a model and training in PyTorch, you can skip directly to `Quantizing Model`_.
.. GENERATED FROM PYTHON SOURCE LINES 17-37
.. code-block:: default
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
# define the optimizer and criterion for pre-training
optimizer = SGD(model.parameters(), 1e-2)
criterion = F.nll_loss
# pre-train and evaluate the model on MNIST dataset
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Average test loss: 0.7073, Accuracy: 7624/10000 (76%)
Average test loss: 0.2776, Accuracy: 9122/10000 (91%)
Average test loss: 0.1907, Accuracy: 9412/10000 (94%)
.. GENERATED FROM PYTHON SOURCE LINES 38-43
Quantizing Model
----------------
Initialize a `config_list`.
For details on how to write a ``config_list``, please refer to the :doc:`compression config specification <../compression/compression_config_list>`.
.. GENERATED FROM PYTHON SOURCE LINES 43-58
.. code-block:: default
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
.. GENERATED FROM PYTHON SOURCE LINES 59-60
finetuning the model by using QAT
.. GENERATED FROM PYTHON SOURCE LINES 60-65
.. code-block:: default
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
dummy_input = torch.rand(32, 1, 28, 28).to(device)
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
TorchModel(
(conv1): QuantizerModuleWrapper(
(module): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
)
(conv2): QuantizerModuleWrapper(
(module): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
)
(fc1): QuantizerModuleWrapper(
(module): Linear(in_features=256, out_features=120, bias=True)
)
(fc2): QuantizerModuleWrapper(
(module): Linear(in_features=120, out_features=84, bias=True)
)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): QuantizerModuleWrapper(
(module): ReLU()
)
(relu2): QuantizerModuleWrapper(
(module): ReLU()
)
(relu3): QuantizerModuleWrapper(
(module): ReLU()
)
(relu4): QuantizerModuleWrapper(
(module): ReLU()
)
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
.. GENERATED FROM PYTHON SOURCE LINES 66-69
The model has now been wrapped, and quantization targets (the 'quant_types' setting in `config_list`)
will be quantized & dequantized for simulated quantization in the wrapped layers.
QAT is a training-aware quantizer; it updates the scale and zero point during training.
.. GENERATED FROM PYTHON SOURCE LINES 69-74
.. code-block:: default
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Average test loss: 0.1542, Accuracy: 9529/10000 (95%)
Average test loss: 0.1133, Accuracy: 9664/10000 (97%)
Average test loss: 0.0919, Accuracy: 9726/10000 (97%)
.. GENERATED FROM PYTHON SOURCE LINES 75-76
export model and get calibration_config
.. GENERATED FROM PYTHON SOURCE LINES 76-82
.. code-block:: default
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0031], device='cuda:0'), 'weight_zero_point': tensor([76.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0018], device='cuda:0'), 'weight_zero_point': tensor([113.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 12.42452621459961}, 'fc1': {'weight_bits': 8, 'weight_scale': tensor([0.0011], device='cuda:0'), 'weight_zero_point': tensor([124.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 31.650196075439453}, 'fc2': {'weight_bits': 8, 'weight_scale': tensor([0.0013], device='cuda:0'), 'weight_zero_point': tensor([122.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 25.805370330810547}, 'relu1': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 12.499907493591309}, 'relu2': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 32.0243034362793}, 'relu3': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 26.491384506225586}, 'relu4': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 17.662996292114258}}
.. GENERATED FROM PYTHON SOURCE LINES 83-84
Build the TensorRT engine to get a real speedup. For more information about speedup, please refer to :doc:`quantization_speedup`.
.. GENERATED FROM PYTHON SOURCE LINES 84-90
.. code-block:: default
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Loss: 0.09358334274291992 Accuracy: 97.21%
Inference elapsed_time (whole dataset): 0.04445981979370117s
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 1 minutes 36.499 seconds)
.. _sphx_glr_download_tutorials_quantization_quick_start_mnist.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: quantization_quick_start_mnist.py <quantization_quick_start_mnist.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: quantization_quick_start_mnist.ipynb <quantization_quick_start_mnist.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# SpeedUp Model with Calibration Config\n\n\n## Introduction\n\nDeep learning network has been computational intensive and memory intensive \nwhich increases the difficulty of deploying deep neural network model. Quantization is a \nfundamental technology which is widely used to reduce memory footprint and speedup inference \nprocess. Many frameworks begin to support quantization, but few of them support mixed precision \nquantization and get real speedup. Frameworks like `HAQ: Hardware-Aware Automated Quantization with Mixed Precision <https://arxiv.org/pdf/1811.08886.pdf>`__\\, only support simulated mixed precision quantization which will \nnot speedup the inference process. To get real speedup of mixed precision quantization and \nhelp people get the real feedback from hardware, we design a general framework with simple interface to allow NNI quantization algorithms to connect different \nDL model optimization backends (e.g., TensorRT, NNFusion), which gives users an end-to-end experience that after quantizing their model \nwith quantization algorithms, the quantized model can be directly speeded up with the connected optimization backend. NNI connects \nTensorRT at this stage, and will support more backends in the future.\n\n\n## Design and Implementation\n\nTo support speeding up mixed precision quantization, we divide framework into two part, frontend and backend. \nFrontend could be popular training frameworks such as PyTorch, TensorFlow etc. Backend could be inference \nframework for different hardwares, such as TensorRT. At present, we support PyTorch as frontend and \nTensorRT as backend. To convert PyTorch model to TensorRT engine, we leverage onnx as intermediate graph \nrepresentation. In this way, we convert PyTorch model to onnx model, then TensorRT parse onnx \nmodel to generate inference engine. \n\n\nQuantization aware training combines NNI quantization algorithm 'QAT' and NNI quantization speedup tool.\nUsers should set config to train quantized model using QAT algorithm(please refer to :doc:`NNI Quantization Algorithms <../compression/quantizer>` ).\nAfter quantization aware training, users can get new config with calibration parameters and model with quantized weight. By passing new config and model to quantization speedup tool, users can get real mixed precision speedup engine to do inference.\n\n\nAfter getting mixed precision engine, users can do inference with input data.\n\n\nNote\n\n\n* Recommend using \"cpu\"(host) as data device(for both inference data and calibration data) since data should be on host initially and it will be transposed to device before inference. If data type is not \"cpu\"(host), this tool will transpose it to \"cpu\" which may increases unnecessary overhead.\n* User can also do post-training quantization leveraging TensorRT directly(need to provide calibration dataset).\n* Not all op types are supported right now. At present, NNI supports Conv, Linear, Relu and MaxPool. More op types will be supported in the following release.\n\n\n## Prerequisite\nCUDA version >= 11.0\n\nTensorRT version >= 7.2\n\nNote\n\n* If you haven't installed TensorRT before or use the old version, please refer to `TensorRT Installation Guide <https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html>`__\\ \n\n## Usage\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch\nimport torch.nn.functional as F\nfrom torch.optim import SGD\nfrom scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt\n\nconfig_list = [{\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_types': ['Conv2d']\n}, {\n 'quant_types': ['output'],\n 'quant_bits': {'output': 8},\n 'op_types': ['ReLU']\n}, {\n 'quant_types': ['input', 'weight'],\n 'quant_bits': {'input': 8, 'weight': 8},\n 'op_names': ['fc1', 'fc2']\n}]\n\nmodel = TorchModel().to(device)\noptimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)\ncriterion = F.nll_loss\ndummy_input = torch.rand(32, 1, 28, 28).to(device)\n\nfrom nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\nquantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)\nquantizer.compress()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"finetuning the model by using QAT\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for epoch in range(3):\n trainer(model, optimizer, criterion)\n evaluator(model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"export model and get calibration_config\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\nos.makedirs('log', exist_ok=True)\nmodel_path = \"./log/mnist_model.pth\"\ncalibration_path = \"./log/mnist_calibration.pth\"\ncalibration_config = quantizer.export_model(model_path, calibration_path)\n\nprint(\"calibration_config: \", calibration_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"build tensorRT engine to make a real speedup\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT\ninput_shape = (32, 1, 28, 28)\nengine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)\nengine.compress()\ntest_trt(engine)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that NNI also supports post-training quantization directly, please refer to complete examples for detail.\n\nFor complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.\n\nFor more parameters about the class 'TensorRTModelSpeedUp', you can refer to :doc:`Model Compression API Reference <../reference/compression/quantization_speedup>`.\n\n### Mnist test\n\non one GTX2080 GPU,\ninput tensor: ``torch.randn(128, 1, 28, 28)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - quantization strategy\n - Latency\n - accuracy\n * - all in 32bit\n - 0.001199961\n - 96%\n * - mixed precision(average bit 20.4)\n - 0.000753688\n - 96%\n * - all in 8bit\n - 0.000229869\n - 93.7%\n\n### Cifar10 resnet18 test (train one epoch)\n\non one GTX2080 GPU,\ninput tensor: ``torch.randn(128, 3, 32, 32)``\n\n.. list-table::\n :header-rows: 1\n :widths: auto\n\n * - quantization strategy\n - Latency\n - accuracy\n * - all in 32bit\n - 0.003286268\n - 54.21%\n * - mixed precision(average bit 11.55)\n - 0.001358022\n - 54.78%\n * - all in 8bit\n - 0.000859139\n - 52.81%\n\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
\ No newline at end of file
Speed up Mixed Precision Quantization Model (experimental)
==========================================================
"""
SpeedUp Model with Calibration Config
======================================
Introduction
@@ -7,10 +8,10 @@ Introduction
Deep learning network has been computational intensive and memory intensive
which increases the difficulty of deploying deep neural network model. Quantization is a
fundamental technology which is widely used to reduce memory footprint and speed up inference
fundamental technology which is widely used to reduce memory footprint and speedup inference
process. Many frameworks begin to support quantization, but few of them support mixed precision
quantization and get real speedup. Frameworks like `HAQ: Hardware-Aware Automated Quantization with Mixed Precision <https://arxiv.org/pdf/1811.08886.pdf>`__\, only support simulated mixed precision quantization which will
not speed up the inference process. To get real speedup of mixed precision quantization and
not speedup the inference process. To get real speedup of mixed precision quantization and
help people get the real feedback from hardware, we design a general framework with simple interface to allow NNI quantization algorithms to connect different
DL model optimization backends (e.g., TensorRT, NNFusion), which gives users an end-to-end experience that after quantizing their model
with quantization algorithms, the quantized model can be directly speeded up with the connected optimization backend. NNI connects
@@ -29,7 +30,7 @@ model to generate inference engine.
Quantization aware training combines NNI quantization algorithm 'QAT' and NNI quantization speedup tool.
Users should set config to train quantized model using QAT algorithm(please refer to `NNI Quantization Algorithms <https://nni.readthedocs.io/en/stable/Compression/Quantizer.html>`__\ ).
Users should set config to train quantized model using QAT algorithm(please refer to :doc:`NNI Quantization Algorithms <../compression/quantizer>` ).
After quantization aware training, users can get new config with calibration parameters and model with quantized weight. By passing new config and model to quantization speedup tool, users can get real mixed precision speedup engine to do inference.
@@ -56,87 +57,112 @@ Note
Usage
-----
quantization aware training:
.. code-block:: python
# arrange bit config for QAT algorithm
configure_list = [{
'quant_types': ['weight', 'output'],
'quant_bits': {'weight':8, 'output':8},
'op_names': ['conv1']
}, {
'quant_types': ['output'],
'quant_bits': {'output':8},
'op_names': ['relu1']
}
]
quantizer = QAT_Quantizer(model, configure_list, optimizer)
quantizer.compress()
calibration_config = quantizer.export_model(model_path, calibration_path)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
# build tensorrt inference engine
engine.compress()
# data should be pytorch tensor
output, time = engine.inference(data)
Note that NNI also supports post-training quantization directly, please refer to complete examples for detail.
For complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.
For more parameters about the class 'TensorRTModelSpeedUp', you can refer to `Model Compression API Reference <https://nni.readthedocs.io/en/stable/Compression/CompressionReference.html#quantization-speedup>`__\.
Mnist test
^^^^^^^^^^^^^^^^^^^
on one GTX2080 GPU,
input tensor: ``torch.randn(128, 1, 28, 28)``
.. list-table::
:header-rows: 1
:widths: auto
* - quantization strategy
- Latency
- accuracy
* - all in 32bit
- 0.001199961
- 96%
* - mixed precision(average bit 20.4)
- 0.000753688
- 96%
* - all in 8bit
- 0.000229869
- 93.7%
Cifar10 resnet18 test(train one epoch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
on one GTX2080 GPU,
input tensor: ``torch.randn(128, 3, 32, 32)``
.. list-table::
:header-rows: 1
:widths: auto
* - quantization strategy
- Latency
- accuracy
* - all in 32bit
- 0.003286268
- 54.21%
* - mixed precision(average bit 11.55)
- 0.001358022
- 54.78%
* - all in 8bit
- 0.000859139
- 52.81%
\ No newline at end of file
"""
# %%
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28, 28).to(device)
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
# %%
# finetuning the model by using QAT
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# export model and get calibration_config
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# build tensorRT engine to make a real speedup
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
# %%
# Note that NNI also supports post-training quantization directly; please refer to the complete examples for details.
#
# For complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.
#
# For more parameters of the class ``ModelSpeedupTensorRT``, you can refer to the :doc:`Model Compression API Reference <../reference/compression/quantization_speedup>`.
#
# Mnist test
# ^^^^^^^^^^
#
# on one GTX2080 GPU,
# input tensor: ``torch.randn(128, 1, 28, 28)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - quantization strategy
# - Latency
# - accuracy
# * - all in 32bit
# - 0.001199961
# - 96%
# * - mixed precision(average bit 20.4)
# - 0.000753688
# - 96%
# * - all in 8bit
# - 0.000229869
# - 93.7%
#
# Cifar10 resnet18 test (train one epoch)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# on one GTX2080 GPU,
# input tensor: ``torch.randn(128, 3, 32, 32)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - quantization strategy
# - Latency
# - accuracy
# * - all in 32bit
# - 0.003286268
# - 54.21%
# * - mixed precision(average bit 11.55)
# - 0.001358022
# - 54.78%
# * - all in 8bit
# - 0.000859139
# - 52.81%
2404b8d0c3958a0191b77bbe882456e4
\ No newline at end of file
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "tutorials/quantization_speedup.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_tutorials_quantization_speedup.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_tutorials_quantization_speedup.py:
SpeedUp Model with Calibration Config
======================================
Introduction
------------
Deep learning networks are computationally intensive and memory intensive,
which increases the difficulty of deploying deep neural network models. Quantization is a
fundamental technology widely used to reduce the memory footprint and speed up the inference
process. Many frameworks have begun to support quantization, but few of them support mixed precision
quantization or deliver a real speedup. Frameworks like `HAQ: Hardware-Aware Automated Quantization with Mixed Precision <https://arxiv.org/pdf/1811.08886.pdf>`__\ only support simulated mixed precision quantization, which does
not speed up the inference process. To get a real speedup from mixed precision quantization and
give people real feedback from hardware, we design a general framework with a simple interface that allows NNI quantization algorithms to connect to different
DL model optimization backends (e.g., TensorRT, NNFusion). This gives users an end-to-end experience: after quantizing their model
with a quantization algorithm, the quantized model can be directly sped up with the connected optimization backend. NNI connects to
TensorRT at this stage, and will support more backends in the future.
Design and Implementation
-------------------------
To support speeding up mixed precision quantization, we divide the framework into two parts: frontend and backend.
The frontend can be a popular training framework such as PyTorch or TensorFlow, and the backend can be an inference
framework for different hardware, such as TensorRT. At present, we support PyTorch as the frontend and
TensorRT as the backend. To convert a PyTorch model to a TensorRT engine, we leverage ONNX as the intermediate graph
representation: we first convert the PyTorch model to an ONNX model, then TensorRT parses the ONNX
model to generate the inference engine.
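NNI performs this conversion internally. Conceptually, the frontend-to-backend handoff is an ONNX export followed by TensorRT parsing the exported graph; a minimal sketch of the PyTorch-to-ONNX step (with a toy stand-in model and a hypothetical file name, not NNI's internal code) looks like this:

.. code-block:: python

    import torch
    import torch.nn as nn

    # toy model standing in for the quantization-aware trained PyTorch model
    model = nn.Sequential(
        nn.Conv2d(1, 6, 5), nn.ReLU(), nn.Flatten(), nn.Linear(6 * 24 * 24, 10))
    dummy_input = torch.rand(32, 1, 28, 28)

    # frontend -> intermediate representation: export the graph as ONNX;
    # the TensorRT backend then parses this ONNX file to build its inference engine
    torch.onnx.export(model, dummy_input, 'mnist_toy.onnx', opset_version=11)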
Quantization aware training combines the NNI quantization algorithm 'QAT' and the NNI quantization speedup tool.
Users should set the config to train a quantized model using the QAT algorithm (please refer to :doc:`NNI Quantization Algorithms <../compression/quantizer>`).
After quantization aware training, users get a new config with calibration parameters and a model with quantized weights. By passing the new config and model to the quantization speedup tool, users get a real mixed precision speedup engine for inference.
After getting the mixed precision engine, users can run inference with input data.
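As a hypothetical usage sketch (assuming ``engine`` is a ``ModelSpeedupTensorRT`` instance that has already been built with ``engine.compress()``, as in the Usage section below):

.. code-block:: python

    import torch

    # keep the batch on CPU (host), as recommended in the note below;
    # the speedup tool transfers it to the device internally before inference
    data = torch.randn(32, 1, 28, 28)

    # returns the model output and the measured inference time for this batch
    output, latency = engine.inference(data)
    print('batch inference time (s):', latency)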
Note
* We recommend using "cpu" (host) as the data device (for both inference data and calibration data), since data should be on the host initially and it will be transferred to the device before inference. If the data device is not "cpu" (host), this tool will transfer it to "cpu" first, which may add unnecessary overhead.
* Users can also do post-training quantization leveraging TensorRT directly (a calibration dataset needs to be provided).
* Not all op types are supported right now. At present, NNI supports Conv, Linear, ReLU and MaxPool. More op types will be supported in the following releases.
Prerequisite
------------
CUDA version >= 11.0
TensorRT version >= 7.2
Note
* If you have not installed TensorRT before or are using an old version, please refer to the `TensorRT Installation Guide <https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html>`__\
Usage
-----
.. GENERATED FROM PYTHON SOURCE LINES 64-92
.. code-block:: default
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28, 28).to(device)
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
TorchModel(
(conv1): QuantizerModuleWrapper(
(module): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
)
(conv2): QuantizerModuleWrapper(
(module): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
)
(fc1): QuantizerModuleWrapper(
(module): Linear(in_features=256, out_features=120, bias=True)
)
(fc2): QuantizerModuleWrapper(
(module): Linear(in_features=120, out_features=84, bias=True)
)
(fc3): Linear(in_features=84, out_features=10, bias=True)
(relu1): QuantizerModuleWrapper(
(module): ReLU()
)
(relu2): QuantizerModuleWrapper(
(module): ReLU()
)
(relu3): QuantizerModuleWrapper(
(module): ReLU()
)
(relu4): QuantizerModuleWrapper(
(module): ReLU()
)
(pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
(pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
)
.. GENERATED FROM PYTHON SOURCE LINES 93-94
finetuning the model by using QAT
.. GENERATED FROM PYTHON SOURCE LINES 94-98
.. code-block:: default
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Average test loss: 0.5386, Accuracy: 8619/10000 (86%)
Average test loss: 0.1553, Accuracy: 9521/10000 (95%)
Average test loss: 0.1001, Accuracy: 9686/10000 (97%)
.. GENERATED FROM PYTHON SOURCE LINES 99-100
export model and get calibration_config
.. GENERATED FROM PYTHON SOURCE LINES 100-108
.. code-block:: default
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
calibration_config: {'conv1': {'weight_bits': 8, 'weight_scale': tensor([0.0029], device='cuda:0'), 'weight_zero_point': tensor([98.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 8, 'weight_scale': tensor([0.0017], device='cuda:0'), 'weight_zero_point': tensor([124.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 8.848002433776855}, 'fc1': {'weight_bits': 8, 'weight_scale': tensor([0.0010], device='cuda:0'), 'weight_zero_point': tensor([134.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 14.64758586883545}, 'fc2': {'weight_bits': 8, 'weight_scale': tensor([0.0013], device='cuda:0'), 'weight_zero_point': tensor([121.], device='cuda:0'), 'input_bits': 8, 'tracked_min_input': 0.0, 'tracked_max_input': 15.807988166809082}, 'relu1': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 9.041301727294922}, 'relu2': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 15.143928527832031}, 'relu3': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 16.151935577392578}, 'relu4': {'output_bits': 8, 'tracked_min_output': 0.0, 'tracked_max_output': 11.749024391174316}}
.. GENERATED FROM PYTHON SOURCE LINES 109-110
build tensorRT engine to make a real speedup
.. GENERATED FROM PYTHON SOURCE LINES 110-117
.. code-block:: default
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
Loss: 0.10061546401977539 Accuracy: 96.83%
Inference elapsed_time (whole dataset): 0.04322671890258789s
.. GENERATED FROM PYTHON SOURCE LINES 118-169
Note that NNI also supports post-training quantization directly; please refer to the complete examples for details.
For complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.
For more parameters of the class ``ModelSpeedupTensorRT``, you can refer to the :doc:`Model Compression API Reference <../reference/compression/quantization_speedup>`.
Mnist test
^^^^^^^^^^
on one GTX2080 GPU,
input tensor: ``torch.randn(128, 1, 28, 28)``
.. list-table::
:header-rows: 1
:widths: auto
* - quantization strategy
- Latency
- accuracy
* - all in 32bit
- 0.001199961
- 96%
* - mixed precision(average bit 20.4)
- 0.000753688
- 96%
* - all in 8bit
- 0.000229869
- 93.7%
Cifar10 resnet18 test (train one epoch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
on one GTX2080 GPU,
input tensor: ``torch.randn(128, 3, 32, 32)``
.. list-table::
:header-rows: 1
:widths: auto
* - quantization strategy
- Latency
- accuracy
* - all in 32bit
- 0.003286268
- 54.21%
* - mixed precision(average bit 11.55)
- 0.001358022
- 54.78%
* - all in 8bit
- 0.000859139
- 52.81%
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 1 minutes 4.509 seconds)
.. _sphx_glr_download_tutorials_quantization_speedup.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: quantization_speedup.py <quantization_speedup.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: quantization_speedup.ipynb <quantization_speedup.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
@@ -5,10 +5,22 @@
Computation times
=================
**00:24.663** total execution time for **tutorials** files:
**01:04.509** total execution time for **tutorials** files:
+-----------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_nni_experiment.py` (``nni_experiment.py``) | 00:24.662 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_nas_quick_start_mnist.py` (``nas_quick_start_mnist.py``) | 00:00.002 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_quantization_speedup.py` (``quantization_speedup.py``) | 01:04.509 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_hello_nas.py` (``hello_nas.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_nasbench_as_dataset.py` (``nasbench_as_dataset.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_pruning_customize.py` (``pruning_customize.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_pruning_quick_start_mnist.py` (``pruning_quick_start_mnist.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_pruning_speedup.py` (``pruning_speedup.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_quantization_customize.py` (``quantization_customize.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorials_quantization_quick_start_mnist.py` (``quantization_quick_start_mnist.py``) | 00:00.000 | 0.0 MB |
+-----------------------------------------------------------------------------------------------------+-----------+--------+
/* Global font */
body, input {
font-family: "Roboto", "Noto Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
code, kbd, pre {
font-family: "Roboto Mono", "Consolas", "Courier New", Courier, monospace;
}
h1, h2, h3, h4, .md-header, .md-tabs, .md-hero {
font-family: "Google Sans", "Noto Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
/* Title font */
@media only screen and (min-width: 45em) {
.md-header-nav__title {
font-weight: 500;
}
}
.md-typeset h4, .md-typeset h5, .md-typeset h6 {
font-weight: 600;
}
/* viewcode link should have left padding */
span.viewcode-link {
padding-left: 0.6rem;
@@ -8,24 +32,55 @@ dt.sig-object {
position: relative;
background: #f4f5f7;
padding: 0.5rem;
border-left: 0.2rem solid #ec407a; /* this should be matched with theme color. */
border-left: 0.2rem solid var(--custom-color-accent);
word-wrap: break-word;
}
.class > dt.sig-object {
border-left: none; /* remove left border */
border-top: 0.18rem solid var(--custom-color-accent);
}
.function > dt.sig-object {
border-left: none; /* remove left border */
border-top: 0.18rem solid var(--custom-color-accent);
}
.exception > dt.sig-object {
border-left: none; /* remove left border */
border-top: 0.18rem solid var(--custom-color-accent);
}
/* Padding on parameter list is not needed */
dl.field-list > dt {
padding-left: 0 !important;
}
dl.field-list > dd {
margin-left: 1.5em;
}
/* show headerlink when hover/focus */
dt.sig-object:focus .headerlink, dt.sig-object:hover .headerlink {
-webkit-transform: translate(0);
transform: translate(0);
opacity: 1;
}
/* logo is too large */
a.md-logo img {
padding: 3px;
}
/* Add split for navigation */
/* Split for navigation */
nav.md-tabs .md-tabs__item:not(:last-child) {
padding-right: 0;
}
nav.md-tabs .md-tabs__item:not(:last-child) .md-tabs__link:after {
content: "»";
font-family: "Material Icons";
padding-left: 0.6rem;
nav.md-tabs .md-tabs__arrow {
padding-left: .3rem;
font-size: inherit;
vertical-align: -10%;
}
/* hide the floating button generated by readthedocs */
@@ -44,11 +99,388 @@ nav.md-tabs .md-tabs__item:not(:last-child) .md-tabs__link:after {
}
/* toc style */
.md-nav span.caption {
li.md-nav__item:not(:first-child) span.caption {
margin-top: 1.25em;
}
@media only screen and (min-width: 76.2em) {
.md-nav--primary .md-nav__title--site {
display: none;
}
}
.md-nav__overview {
font-weight: 500;
}
@media only screen and (max-width: 76.1875em) {
.md-nav__overview {
display: none;
}
}
/* hide nav bar in some cases */
.md-tabs.hidden {
display: none;
}
/* citation style */
.citation dt {
padding-right: 1em;
}
/* inline code style */
.md-typeset code {
padding-left: 0.07em;
padding-right: 0.07em;
}
/* for release icon, on home page */
.release-icon {
margin-left: 8px;
width: 40px;
}
/* Similar to cardlink, but used in codesnippet in index page. see sphinx_gallery.css */
.codesnippet-card-container {
display: flex;
flex-flow: wrap row;
}
.codesnippet-card.admonition {
border-left: 0;
padding: 0;
margin: .5rem 1rem 1rem 0rem;
width: 100%;
}
/* Controlling the cards in containers only */
.codesnippet-card-container .codesnippet-card.admonition {
width: 47%;
}
@media only screen and (max-width:59.9375em) {
.codesnippet-card-container .codesnippet-card.admonition {
width: 100%;
}
}
.codesnippet-card .codesnippet-card-body {
min-height: 4rem;
position: relative;
padding: 0.9rem 0.9rem 3rem 0.9rem;
}
.codesnippet-card .codesnippet-card-footer {
padding: 0.8rem 0.9rem;
border-top: 1px solid #ddd;
margin: 0 !important;
position: absolute;
bottom: 0;
width: 100%;
}
.codesnippet-card a:not(:hover) {
color: rgba(0, 0, 0, .68);
}
.codesnippet-card-title-container {
margin-top: 0.3rem;
position: relative;
}
.codesnippet-card-title-container h4 {
padding-left: 2.3rem;
line-height: 1.6rem;
height: 1.6rem;
margin-top: 0;
}
.codesnippet-card-icon {
position: absolute;
top: 0;
left: 0;
}
.codesnippet-card-icon img {
max-width: 100%;
max-height: 100%;
/* horizontal and vertical center */
/* https://stackoverflow.com/questions/7273338/how-to-vertically-align-an-image-inside-a-div */
text-align: center;
vertical-align: middle;
position: absolute;
left: 0;
right: 0;
top: 0;
bottom: 0;
margin: auto;
}
.codesnippet-card-icon {
width: 1.6rem;
height: 1.6rem;
padding: 0;
}
.codesnippet-card-link {
position: relative;
}
.codesnippet-card-link .material-icons {
position: absolute;
right: 0;
}
/* fixes reference overlapping issue */
/* This is originally defined to be negative in application_fixes.css */
/* They did that to ensure the header doesn't disappear in jump links */
/* We did this by using scroll-margin-top instead */
dt:target {
margin-top: 0.15rem !important;
padding-top: 0.5rem !important;
}
:target {
/* header height */
scroll-margin-top: 3.5rem;
}
/* fix code block style on mobile */
@media only screen and (max-width: 44.9375em) {
.md-typeset pre {
margin: 1em -0.3em;
}
}
/* Responsive nav bar */
.md-source__fact {
padding: 0 !important;
}
/* collapsible toctree */
.md-nav--primary ul li {
padding-left: .8rem;
}
.md-nav__item {
position: relative;
}
.md-nav__expand > a > .md-nav__tocarrow {
transform: rotate(-90deg);
font-size: inherit;
transition: all 0.1s ease;
position: absolute;
left: .1rem;
top: .05rem;
}
.md-nav__expand .md-nav__list {
display: none;
}
.md-nav__expand--active > .md-nav__list {
display: block;
}
.md-nav__expand--active > a > .md-nav__tocarrow {
transform: rotate(0);
}
@media only screen and (max-width:76.1875em) {
.md-nav--primary .md-nav__link {
padding: .15rem .2rem .15rem .6rem;
}
.md-nav__expand > a > .md-nav__tocarrow {
left: 0;
top: .25rem;
}
.md-nav--primary span.md-nav__link.caption {
margin-top: 0.75em;
}
.md-nav--primary .md-nav__item .md-nav__list .md-nav__item {
padding-left: .3rem;
}
html .md-nav--primary .md-nav__title--site .md-nav__button {
height: auto;
font-size: inherit;
left: 0;
}
html .md-nav--primary .md-nav__title {
padding-top: 2rem;
padding-left: .6rem;
height: 4.6rem;
}
.md-nav--primary .md-nav__item, .md-nav--primary .md-nav__title {
font-size: .7rem;
line-height: 1.3;
}
}
/* Increase TOC padding */
.md-nav--primary ul li ul li {
padding-left: 0.8rem;
}
/* Nav bar and heroes */
@media only screen and (min-width:60em) {
.md-search__form, .md-search__input {
border-radius: .3rem; /* even rounder */
}
}
.md-header-nav__source, .md-source {
padding-right: 0 !important;
}
.md-hero {
position: relative;
}
.md-hero__background {
max-width: 73rem;
position: absolute;
bottom: -46px;
left: 0;
right: 0;
margin-left: auto;
margin-right: auto;
width: 100%;
z-index: 0;
}
.md-hero__background img {
width: 100%;
}
@media only screen and (max-width:59.9375em) {
.md-hero__background {
display: none;
}
}
@media only screen and (max-width:76.1875em) {
.md-hero__background {
bottom: -5%;
top: auto;
}
}
.md-hero__inner {
z-index: 1;
position: relative;
padding-right: 35%;
}
@media only screen and (min-width:76.2em) {
.md-hero__inner {
padding-top: 2.4rem;
padding-bottom: 1.2rem;
}
}
/* make title look larger */
.md-typeset h1 {
margin: 0 0 1.5rem;
color: rgba(0,0,0,.85);
font-size: 1.5625rem;
line-height: 1.3;
}
.md-typeset h1, .md-typeset h2, .md-typeset h3 {
font-weight: 400;
letter-spacing: 0;
}
/* Enlarge table */
.md-typeset table:not([class]) {
font-size: 0.7rem;
box-shadow: 0 2px 2px 0 rgb(0 0 0 / 8%), 0 1px 5px 0 rgb(0 0 0 / 7%), 0 3px 1px -2px rgb(0 0 0 / 14%);
}
.md-typeset table:not([class]) th {
padding: .5rem .7rem;
background-color: #e6e7e8;
color: inherit;
font-weight: 500;
}
.md-typeset table:not([class]) td {
padding: .4rem .7rem;
}
/* On this page TOC */
.md-sidebar--secondary .md-nav--secondary {
border-inline-start: 4px solid var(--custom-color-primary);
}
.md-nav__link {
margin-top: 0.45em;
}
/* Override style for copy button */
button.copybtn {
opacity: 1;
}
.o-tooltip--left:after {
transform: translateX(-5%) translateY(-125%);
padding: .4em;
font-size: .5rem;
font-weight: 600;
background: #5f6368;
}
.o-tooltip--left:hover:after {
transform: translateX(-5%) translateY(-120%);
}
/* Sphinx tabs */
/* Copied from https://github.com/executablebooks/sphinx-tabs/blob/master/sphinx_tabs/static/tabs.css with modifications */
.sphinx-tabs.container {
margin-bottom: 1rem;
border: 1px solid rgb(232, 234, 237);
border-radius: 8px;
}
[role="tablist"] {
padding: .3rem 0 0 0;
border-bottom: 1px solid #a0b3bf;
}
.sphinx-tabs-tab {
position: relative;
line-height: 2rem;
font-weight: 600;
padding: 0 1rem;
color: #80868b;
}
.sphinx-tabs-tab[aria-selected="true"] {
color: #3f51b5; /* primary color */
border-bottom: 2px solid #3f51b5;
}
.sphinx-tabs-tab:focus {
z-index: 1;
outline-offset: 1px;
}
.sphinx-tabs-panel {
position: relative;
padding: 1rem;
}
.sphinx-tabs-panel.code-tab {
padding: 0;
}
.sphinx-tabs-panel.code-tab .highlight {
margin: 0;
padding: .5rem;
}
.sphinx-tab img {
margin-bottom: 2rem;
}
/* https://codepen.io/mildrenben/pen/RPwQEY */
@media only screen and (max-width:44.9375em) {
.drop {
display: none;
}
}
.drop {
width: 125px;
width: 5.3rem;
vertical-align: middle;
}
@media only screen and (min-width:60em) and (max-width:70em) {
.drop {
width: 4.7rem;
}
/* also narrow nav source width */
.md-header-nav__source {
width: 10rem;
}
}
.drop button {
color: inherit;
font-weight: 700;
/* Theme related customization */
/* https://github.com/bashtage/sphinx-material/pull/122 */
/* first part */
button[data-md-color-primary=custom] {
background-color: var(--custom-color-primary)
}
[data-md-color-primary=custom] .md-typeset a {
color: var(--custom-color-primary)
}
[data-md-color-primary=custom] .md-header,
[data-md-color-primary=custom] .md-hero {
background-color: var(--custom-color-primary)
}
[data-md-color-primary=custom] .md-nav__link--active,
[data-md-color-primary=custom] .md-nav__link:active {
color: var(--custom-color-primary)
}
[data-md-color-primary=custom] .md-nav__item--nested>.md-nav__link {
color: inherit
}
[data-md-color-primary=custom] .md-nav__extra_link:active {
color: var(--custom-color-primary)
}
[data-md-color-primary=custom] .md-nav__item--nested > .md-nav__extra_link {
color: inherit
}
/* second part */
button[data-md-color-accent=custom] {
background-color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-typeset a:active,
[data-md-color-accent=custom] .md-typeset a:hover {
color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-typeset .codehilite pre::-webkit-scrollbar-thumb:hover,
[data-md-color-accent=custom] .md-typeset pre code::-webkit-scrollbar-thumb:hover {
background-color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-nav__link:focus,
[data-md-color-accent=custom] .md-nav__link:hover,
[data-md-color-accent=custom] .md-typeset .footnote li:hover .footnote-backref:hover,
[data-md-color-accent=custom] .md-typeset .footnote li:target .footnote-backref,
[data-md-color-accent=custom] .md-typeset .md-clipboard:active:before,
[data-md-color-accent=custom] .md-typeset .md-clipboard:hover:before,
[data-md-color-accent=custom] .md-typeset [id] .headerlink:focus,
[data-md-color-accent=custom] .md-typeset [id]:hover .headerlink:hover,
[data-md-color-accent=custom] .md-typeset [id]:target .headerlink {
color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-search__scrollwrap::-webkit-scrollbar-thumb:hover {
background-color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-search-result__link:hover,
[data-md-color-accent=custom] .md-search-result__link[data-md-state=active] {
background-color: rgba(83, 109, 254, .1)
}
[data-md-color-accent=custom] .md-sidebar__scrollwrap::-webkit-scrollbar-thumb:hover {
background-color: var(--custom-color-accent)
}
[data-md-color-accent=custom] .md-source-file:hover:before {
background-color: var(--custom-color-accent)
}
/* third part */
@media only screen and (max-width:59.9375em) {
[data-md-color-primary=custom] .md-nav__source {
background-color: var(--custom-color-primary);
opacity: 0.9675;
}
}
@media only screen and (max-width:76.1875em) {
html [data-md-color-primary=custom] .md-nav--primary .md-nav__title--site {
background-color: var(--custom-color-primary)
}
}
@media only screen and (min-width:76.25em) {
[data-md-color-primary=custom] .md-tabs {
background-color: var(--custom-color-primary)
}
}
@@ -11,7 +11,7 @@ p.sphx-glr-script-out {
}
.sphx-glr-script-out .highlight pre {
background-color: #f2f3fa !important;
background-color: #f0f6fa !important;
padding: .6rem !important;
margin-bottom: 1.5rem;
}
@@ -26,6 +26,7 @@ div.sphx-glr-footer {
.sphx-glr-download-link-note {
margin-bottom: 1.5rem;
line-height: 2.5rem;
}
.notebook-action-div {
@@ -37,7 +38,7 @@ div.sphx-glr-footer {
.notebook-action-link:hover .notebook-action-div {
/* match theme */
border-bottom-color: #f50057;
border-bottom-color: var(--custom-color-accent);
}
.notebook-action-link img {
@@ -83,6 +84,11 @@ div.sphx-glr-footer {
font-size: 0.75rem;
}
/* hide link */
.card-link-anchor {
display: none;
}
.card-link-tag {
margin-right: 0.4rem;
background: #eeeff2;
@@ -103,8 +109,8 @@ div.sphx-glr-footer {
}
.card-link-icon img {
max-width: 80%;
max-height: 80%;
max-width: 75%;
max-height: 75%;
/* horizontal and vertical center */
/* https://stackoverflow.com/questions/7273338/how-to-vertically-align-an-image-inside-a-div */
text-align: center;
@@ -119,7 +125,7 @@ div.sphx-glr-footer {
/* link icon background color */
.card-link-icon.circle {
background-color: #283593;
background-color: #E8DCEE;
border-radius: 50%;
width: 4rem;
height: 4rem;
@@ -127,50 +133,18 @@ div.sphx-glr-footer {
}
/* pallette */
.card-link-icon.red {
background-color: #C62828;
}
.card-link-icon.pink {
background-color: #AD1457;
.card-link-icon.cyan {
background-color: #DDEFF2;
}
.card-link-icon.purple {
background-color: #8E24AA;
}
.card-link-icon.deep-purple {
background-color: #512DA8;
background-color: #E8DCEE;
}
.card-link-icon.blue {
background-color: #1565C0;
}
.card-link-icon.light-blue {
background-color: #0277BD;
}
.card-link-icon.cyan {
background-color: #006064;
}
.card-link-icon.teal {
background-color: #00796B;
}
.card-link-icon.green {
background-color: #2E7D32;
}
.card-link-icon.deep-orange {
background-color: #BF360C;
}
.card-link-icon.brown {
background-color: #6D4C41;
background-color: #DBE8FC;
}
.card-link-icon.indigo {
background-color: #3949AB;
background-color: #E1E4F3;
}