Resolve conflicts for #4760 (#4762)

a911b856 · Yuge Zhang · GitHub · 14d2966b · 14d2966b · a911b856
Unverified Commit a911b856 authored Apr 21, 2022 by Yuge Zhang Committed by GitHub Apr 21, 2022
20 changed files
--- a/examples/tutorials/nni_experiment.py
+++ b/examples/tutorials/nni_experiment.py
-"""
-Start and Manage a New Experiment
-=================================
-"""
-# %%
-# Configure Search Space
-# ----------------------
-search_space = {
-    "C": {"_type": "quniform", "_value": [0.1, 1, 0.1]},
-    "kernel": {"_type": "choice", "_value": ["linear", "rbf", "poly", "sigmoid"]},
-    "degree": {"_type": "choice", "_value": [1, 2, 3, 4]},
-    "gamma": {"_type": "quniform", "_value": [0.01, 0.1, 0.01]},
-    "coef0": {"_type": "quniform", "_value": [0.01, 0.1, 0.01]}
-}
-# %%
-# Configure Experiment
-# --------------------
-from nni.experiment import Experiment
-experiment = Experiment('local')
-experiment.config.experiment_name = 'Example'
-experiment.config.trial_concurrency = 2
-experiment.config.max_trial_number = 10
-experiment.config.search_space = search_space
-experiment.config.trial_command = 'python scripts/trial_sklearn.py'
-experiment.config.trial_code_directory = './'
-experiment.config.tuner.name = 'TPE'
-experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
-experiment.config.training_service.use_active_gpu = True
-# %%
-# Start Experiment
-# ----------------
-experiment.start(8080)
-# %%
-# Experiment View & Control
-# -------------------------
-#
-# View the status of experiment.
-experiment.get_status()
-# %%
-# Wait until at least one trial finishes.
-import time
-for _ in range(10):
-    stats = experiment.get_job_statistics()
-    if any(stat['trialJobStatus'] == 'SUCCEEDED' for stat in stats):
-        break
-    time.sleep(10)
-# %%
-# Export the experiment data.
-experiment.export_data()
-# %%
-# Get metric of jobs
-experiment.get_job_metrics()
-# %%
-# Stop Experiment
-# ---------------
-experiment.stop()
--- a/examples/tutorials/pruning_customize.py
+++ b/examples/tutorials/pruning_customize.py
+"""
+Customize Basic Pruner
+======================
+Users can easily customize a basic pruner in NNI. A large number of basic modules have been provided and can be reused.
+By following the NNI pruning interface, users only need to focus on their creative parts without worrying about other regular modules.
+In this tutorial, we show how to customize a basic pruner.
+Concepts
+--------
+NNI abstracts the basic pruning process into three steps: collecting data, calculating metrics, and allocating sparsity.
+Most pruning algorithms rely on a metric to decide what should be pruned. Taking the L1 norm pruner as an example,
+the first step is collecting the model weights, the second step is calculating the L1 norm of the weight per output channel,
+and the third step is ranking the L1 norm metric and masking the output channels that have a small L1 norm.
+In the NNI basic pruner, these three steps are implemented as ``DataCollector``, ``MetricsCalculator`` and ``SparsityAllocator``.
+-   ``DataCollector``: This module takes the pruner as its initialization parameter.
+    It will get the relevant information of the model from the pruner,
+    and sometimes it will also hook the model to get input, output or gradient of a layer or a tensor.
+    It can also patch optimizer if some special steps need to be executed before or after ``optimizer.step()``.
+-   ``MetricsCalculator``: This module will take the data collected from the ``DataCollector``,
+    then calculate the metrics. The metric shape is usually reduced from the data shape.
+    The ``dim`` taken by ``MetricsCalculator`` specifies which dimension is kept after the metrics are calculated.
+    For example, if the collected data shape is (10, 20, 30) and ``dim`` is 1, then dimension 1 is kept and
+    the output metrics shape should be (20,).
+-   ``SparsityAllocator``: This module takes the metrics and generates the masks.
+    Different ``SparsityAllocator`` implementations have different mask generation strategies.
+    A common and simple strategy is to sort the metric values, calculate a threshold according to the configured sparsity,
+    and mask the positions whose metric value is smaller than the threshold.
+    The ``dim`` taken by ``SparsityAllocator`` specifies which dimension the metrics are for; the mask will be expanded to the weight shape.
+    For example, suppose the metric shape is (20,), the corresponding layer weight shape is (20, 40), and ``dim`` is 0.
+    ``SparsityAllocator`` will first generate a mask with shape (20,), then expand this mask to shape (20, 40).
+Simple Example: Customize a Block-L1NormPruner
+----------------------------------------------
+NNI already has L1NormPruner, but in order to reproduce the paper and reduce the number of user configuration items,
+it only supports pruning layer output channels. In this example, we will customize a pruner that supports block granularity for Linear.
+Note that you don't need to implement all three kinds of tools every time;
+NNI provides many predefined tools, and you can use them directly to customize your own pruner.
+Since this is a tutorial, we show how to define all three kinds of pruning tools.
+First, customize the pruning tools used by the pruner.
+"""
+import torch
+from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import BasicPruner
+from nni.algorithms.compression.v2.pytorch.pruning.tools import (
+    DataCollector,
+    MetricsCalculator,
+    SparsityAllocator
+)
+# This data collector collects weight in wrapped module as data.
+# The wrapped module is the module configured in pruner's config_list.
+# This implementation is similar to nni.algorithms.compression.v2.pytorch.pruning.tools.WeightDataCollector
+class WeightDataCollector(DataCollector):
+    def collect(self):
+        data = {}
+        # get_modules_wrapper will get all the wrapper in the compressor (pruner),
+        # it returns a dict with format {wrapper_name: wrapper},
+        # use wrapper.module to get the wrapped module.
+        for _, wrapper in self.compressor.get_modules_wrapper().items():
+            data[wrapper.name] = wrapper.module.weight.data
+        # return {wrapper_name: weight_data}
+        return data
+class BlockNormMetricsCalculator(MetricsCalculator):
+    def __init__(self, block_sparse_size):
+        # Because we will keep all dimension with block granularity, so fix ``dim=None``,
+        # means all dimensions will be kept.
+        super().__init__(dim=None, block_sparse_size=block_sparse_size)
+    def calculate_metrics(self, data):
+        data_length = len(self.block_sparse_size)
+        reduce_unfold_dims = list(range(data_length, 2 * data_length))
+        metrics = {}
+        for name, t in data.items():
+            # Unfold t as block size, and calculate L1 Norm for each block.
+            for dim, size in enumerate(self.block_sparse_size):
+                t = t.unfold(dim, size, size)
+            metrics[name] = t.norm(dim=reduce_unfold_dims, p=1)
+        # return {wrapper_name: block_metric}
+        return metrics
+# This implementation is similar to nni.algorithms.compression.v2.pytorch.pruning.tools.NormalSparsityAllocator
+class BlockSparsityAllocator(SparsityAllocator):
+    def __init__(self, pruner, block_sparse_size):
+        super().__init__(pruner, dim=None, block_sparse_size=block_sparse_size, continuous_mask=True)
+    def generate_sparsity(self, metrics):
+        masks = {}
+        for name, wrapper in self.pruner.get_modules_wrapper().items():
+            # wrapper.config['total_sparsity'] can get the configured sparsity ratio for this wrapped module
+            sparsity_rate = wrapper.config['total_sparsity']
+            # get metric for this wrapped module
+            metric = metrics[name]
+            # mask the metric with the old mask if the masked positions should never be recovered;
+            # if you are new to NNI pruning, it is fine to just keep this as is
+            if self.continuous_mask:
+                metric *= self._compress_mask(wrapper.weight_mask)
+            # convert sparsity ratio to prune number
+            prune_num = int(sparsity_rate * metric.numel())
+            # calculate the metric threshold
+            threshold = torch.topk(metric.view(-1), prune_num, largest=False)[0].max()
+            # generate mask, keep the metric positions that metric values greater than the threshold
+            mask = torch.gt(metric, threshold).type_as(metric)
+            # expand the mask to weight size, if the block is masked, this block will be filled with zeros,
+            # otherwise filled with ones
+            masks[name] = self._expand_mask(name, mask)
+            # merge the new mask with the old mask if the masked positions should never be recovered;
+            # if you are new to NNI pruning, it is fine to just keep this as is
+            if self.continuous_mask:
+                masks[name]['weight'] *= wrapper.weight_mask
+        return masks
+# %%
+# Customize the pruner.
+class BlockL1NormPruner(BasicPruner):
+    def __init__(self, model, config_list, block_sparse_size):
+        self.block_sparse_size = block_sparse_size
+        super().__init__(model, config_list)
+    # Implementing reset_tools is enough for this pruner.
+    def reset_tools(self):
+        if self.data_collector is None:
+            self.data_collector = WeightDataCollector(self)
+        else:
+            self.data_collector.reset()
+        if self.metrics_calculator is None:
+            self.metrics_calculator = BlockNormMetricsCalculator(self.block_sparse_size)
+        if self.sparsity_allocator is None:
+            self.sparsity_allocator = BlockSparsityAllocator(self, self.block_sparse_size)
+# %%
+# Try this pruner.
+# Define a simple model.
+class TestModel(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.fc1 = torch.nn.Linear(4, 8)
+        self.fc2 = torch.nn.Linear(8, 4)
+    def forward(self, x):
+        return self.fc2(self.fc1(x))
+model = TestModel()
+config_list = [{'op_types': ['Linear'], 'total_sparsity': 0.5}]
+# use 2x2 block
+_, masks = BlockL1NormPruner(model, config_list, [2, 2]).compress()
+# show the generated masks
+print('fc1 masks:\n', masks['fc1']['weight'])
+print('fc2 masks:\n', masks['fc2']['weight'])
+# %%
+# This time we successfully defined a new pruner with block pruning granularity!
+# Note that we don't put validation logic in this example, like ``_validate_config_before_canonical``,
+# but for a robust implementation, we suggest you include the validation logic.
--- a/examples/tutorials/pruning_quick_start_mnist.py
+++ b/examples/tutorials/pruning_quick_start_mnist.py
+"""
+Pruning Quickstart
+==================
+Model pruning is a technique to reduce the model size and computation by reducing model weight size or intermediate state size.
+There are three common practices for pruning a DNN model:
+#. Pre-training a model -> Pruning the model -> Fine-tuning the pruned model
+#. Pruning a model during training (i.e., pruning aware training) -> Fine-tuning the pruned model
+#. Pruning a model -> Training the pruned model from scratch
+NNI supports all of the above pruning practices by working on the key pruning stage.
+Follow this tutorial for a quick look at how to use NNI to prune a model in a common practice.
+"""
+# %%
+# Preparation
+# -----------
+#
+# In this tutorial, we use a simple model and pre-train it on the MNIST dataset.
+# If you are familiar with defining a model and training in pytorch, you can skip directly to `Pruning Model`_.
+import torch
+import torch.nn.functional as F
+from torch.optim import SGD
+from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device
+# define the model
+model = TorchModel().to(device)
+# show the model structure, note that pruner will wrap the model layer.
+print(model)
+# %%
+# define the optimizer and criterion for pre-training
+optimizer = SGD(model.parameters(), 1e-2)
+criterion = F.nll_loss
+# pre-train and evaluate the model on MNIST dataset
+for epoch in range(3):
+    trainer(model, optimizer, criterion)
+    evaluator(model)
+# %%
+# Pruning Model
+# -------------
+#
+# Using L1NormPruner to prune the model and generate the masks.
+# Usually, a pruner requires original model and ``config_list`` as its inputs.
+# For details about how to write ``config_list``, please refer to :doc:`compression config specification <../compression/compression_config_list>`.
+#
+# The following `config_list` means all layers whose type is `Linear` or `Conv2d` will be pruned,
+# except the layer named `fc3`, because `fc3` is `exclude`.
+# The final sparsity ratio for each layer is 50%. The layer named `fc3` will not be pruned.
+config_list = [{
+    'sparsity_per_layer': 0.5,
+    'op_types': ['Linear', 'Conv2d']
+}, {
+    'exclude': True,
+    'op_names': ['fc3']
+}]
+# %%
+# Pruners usually require `model` and `config_list` as input arguments.
+from nni.compression.pytorch.pruning import L1NormPruner
+pruner = L1NormPruner(model, config_list)
+# show the wrapped model structure, `PrunerModuleWrapper` has wrapped the layers that are configured in the config_list.
+print(model)
+# %%
+# compress the model and generate the masks
+_, masks = pruner.compress()
+# show the masks sparsity
+for name, mask in masks.items():
+    print(name, ' sparsity : ', '{:.2}'.format(mask['weight'].sum() / mask['weight'].numel()))
+# %%
+# Speedup the original model with masks, note that `ModelSpeedup` requires an unwrapped model.
+# The model becomes smaller after speedup,
+# and reaches a higher sparsity ratio because `ModelSpeedup` will propagate the masks across layers.
+# need to unwrap the model, if the model is wrapped before speedup
+pruner._unwrap_model()
+# speedup the model, for more information about speedup, please refer to :doc:`pruning_speedup`.
+from nni.compression.pytorch.speedup import ModelSpeedup
+ModelSpeedup(model, torch.rand(3, 1, 28, 28).to(device), masks).speedup_model()
+# %%
+# the model becomes really smaller after speedup
+print(model)
+# %%
+# Fine-tuning Compacted Model
+# ---------------------------
+# Note that if the model has been sped up, you need to re-initialize a new optimizer for fine-tuning.
+# Because speedup will replace the masked big layers with dense small ones.
+optimizer = SGD(model.parameters(), 1e-2)
+for epoch in range(3):
+    trainer(model, optimizer, criterion)
--- a/examples/tutorials/pruning_speedup.py
+++ b/examples/tutorials/pruning_speedup.py
+"""
+Speedup Model with Mask
+========================
+Introduction
+------------
+Pruning algorithms usually use weight masks to simulate the real pruning. Masks can be used
+to check model performance of a specific pruning (or sparsity), but there is no real speedup.
+Since model speedup is the ultimate goal of model pruning, we try to provide a tool to users
+to convert a model to a smaller one based on user provided masks (the masks come from the
+pruning algorithms).
+There are two types of pruning. One is fine-grained pruning, it does not change the shape of weights,
+and input/output tensors. Sparse kernel is required to speedup a fine-grained pruned layer.
+The other is coarse-grained pruning (e.g., channels), shape of weights and input/output tensors usually change due to such pruning.
+To speedup this kind of pruning, there is no need to use sparse kernel, just replace the pruned layer with smaller one.
+Since the support of sparse kernels in community is limited,
+we only support the speedup of coarse-grained pruning and leave the support of fine-grained pruning in future.
+Design and Implementation
+-------------------------
+To speedup a model, the pruned layers should be replaced, either replaced with smaller layer for coarse-grained mask,
+or replaced with sparse kernel for fine-grained mask. Coarse-grained mask usually changes the shape of weights or input/output tensors,
+thus, we should do shape inference to check whether other unpruned layers should be replaced as well due to the shape change.
+Therefore, in our design, there are two main steps: first, do shape inference to find out all the modules that should be replaced;
+second, replace the modules.
+The first step requires topology (i.e., connections) of the model, we use ``jit.trace`` to obtain the model graph for PyTorch.
+The new shape of each module is inferred automatically by NNI; the unchanged parts of outputs during forward and inputs during backward are prepared for reduction.
+For each type of module, we should prepare a function for module replacement.
+The module replacement function returns a newly created module which is smaller.
+Usage
+-----
+"""
+# %%
+# Generate a mask for the model at first.
+# We usually use a NNI pruner to generate the masks then use ``ModelSpeedup`` to compact the model.
+# But in fact ``ModelSpeedup`` is a relatively independent tool, so you can use it independently.
+import torch
+from scripts.compression_mnist_model import TorchModel, device
+model = TorchModel().to(device)
+# masks = {layer_name: {'weight': weight_mask, 'bias': bias_mask}}
+conv1_mask = torch.ones_like(model.conv1.weight.data)
+# mask the first three output channels in conv1
+conv1_mask[0: 3] = 0
+masks = {'conv1': {'weight': conv1_mask}}
+# %%
+# Show the original model structure.
+print(model)
+# %%
+# Roughly test the original model inference speed.
+import time
+start = time.time()
+model(torch.rand(128, 1, 28, 28).to(device))
+print('Original Model - Elapsed Time : ', time.time() - start)
+# %%
+# Speedup the model and show the model structure after speedup.
+from nni.compression.pytorch import ModelSpeedup
+ModelSpeedup(model, torch.rand(10, 1, 28, 28).to(device), masks).speedup_model()
+print(model)
+# %%
+# Roughly test the model after speedup inference speed.
+start = time.time()
+model(torch.rand(128, 1, 28, 28).to(device))
+print('Speedup Model - Elapsed Time : ', time.time() - start)
+# %%
+# For combining usage of ``Pruner`` masks generation with ``ModelSpeedup``,
+# please refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.
+#
+# NOTE: The current implementation supports PyTorch 1.3.1 or newer.
+#
+# Limitations
+# -----------
+#
+# For PyTorch we can only replace modules; if functions in ``forward`` need to be replaced,
+# our current implementation does not work. One workaround is to make the function a PyTorch module.
+#
+# If you want to speed up your own model which is not supported by the current implementation,
+# you need to implement the replace function for module replacement. Contributions are welcome.
+#
+# Speedup Results of Examples
+# ---------------------------
+#
+# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.
+#
+# These results were tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_, new results will be coming soon.
+#
+# slim pruner example
+# ^^^^^^^^^^^^^^^^^^^
+#
+# on one V100 GPU,
+# input tensor: ``torch.randn(64, 3, 32, 32)``
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - Times
+#      - Mask Latency
+#      - Speedup Latency
+#    * - 1
+#      - 0.01197
+#      - 0.005107
+#    * - 2
+#      - 0.02019
+#      - 0.008769
+#    * - 4
+#      - 0.02733
+#      - 0.014809
+#    * - 8
+#      - 0.04310
+#      - 0.027441
+#    * - 16
+#      - 0.07731
+#      - 0.05008
+#    * - 32
+#      - 0.14464
+#      - 0.10027
+#
+# fpgm pruner example
+# ^^^^^^^^^^^^^^^^^^^
+#
+# on cpu,
+# input tensor: ``torch.randn(64, 1, 28, 28)``\ ,
+# too large variance
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - Times
+#      - Mask Latency
+#      - Speedup Latency
+#    * - 1
+#      - 0.01383
+#      - 0.01839
+#    * - 2
+#      - 0.01167
+#      - 0.003558
+#    * - 4
+#      - 0.01636
+#      - 0.01088
+#    * - 40
+#      - 0.14412
+#      - 0.08268
+#    * - 40
+#      - 1.29385
+#      - 0.14408
+#    * - 40
+#      - 0.41035
+#      - 0.46162
+#    * - 400
+#      - 6.29020
+#      - 5.82143
+#
+# l1filter pruner example
+# ^^^^^^^^^^^^^^^^^^^^^^^
+#
+# on one V100 GPU,
+# input tensor: ``torch.randn(64, 3, 32, 32)``
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - Times
+#      - Mask Latency
+#      - Speedup Latency
+#    * - 1
+#      - 0.01026
+#      - 0.003677
+#    * - 2
+#      - 0.01657
+#      - 0.008161
+#    * - 4
+#      - 0.02458
+#      - 0.020018
+#    * - 8
+#      - 0.03498
+#      - 0.025504
+#    * - 16
+#      - 0.06757
+#      - 0.047523
+#    * - 32
+#      - 0.10487
+#      - 0.086442
+#
+# APoZ pruner example
+# ^^^^^^^^^^^^^^^^^^^
+#
+# on one V100 GPU,
+# input tensor: ``torch.randn(64, 3, 32, 32)``
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - Times
+#      - Mask Latency
+#      - Speedup Latency
+#    * - 1
+#      - 0.01389
+#      - 0.004208
+#    * - 2
+#      - 0.01628
+#      - 0.008310
+#    * - 4
+#      - 0.02521
+#      - 0.014008
+#    * - 8
+#      - 0.03386
+#      - 0.023923
+#    * - 16
+#      - 0.06042
+#      - 0.046183
+#    * - 32
+#      - 0.12421
+#      - 0.087113
+#
+# SimulatedAnnealing pruner example
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# In this experiment, we use SimulatedAnnealing pruner to prune the resnet18 on the cifar10 dataset.
+# We measure the latencies and accuracies of the pruned model under different sparsity ratios, as shown in the following figure.
+# The latency is measured on one V100 GPU and the input tensor is  ``torch.randn(128, 3, 32, 32)``.
+#
+# .. image:: ../../img/SA_latency_accuracy.png
--- a/examples/tutorials/quantization_customize.py
+++ b/examples/tutorials/quantization_customize.py
+"""
+Customize a new quantization algorithm
+======================================
+To write a new quantization algorithm, you can write a class that inherits ``nni.compression.pytorch.Quantizer``.
+Then, override the member functions with the logic of your algorithm. The member function to override is ``quantize_weight``.
+``quantize_weight`` directly returns the quantized weights rather than mask, because for quantization the quantized weights cannot be obtained by applying mask.
+"""
+from nni.compression.pytorch import Quantizer
+class YourQuantizer(Quantizer):
+    def __init__(self, model, config_list):
+        """
+        Suggest you to use the NNI defined spec for config
+        """
+        super().__init__(model, config_list)
+    def quantize_weight(self, weight, config, **kwargs):
+        """
+        quantize should overload this method to quantize weight tensors.
+        This method is effectively hooked to :meth:`forward` of the model.
+        Parameters
+        ----------
+        weight : Tensor
+            weight that needs to be quantized
+        config : dict
+            the configuration for weight quantization
+        """
+        # Put your code to generate `new_weight` here
+        new_weight = ...
+        return new_weight
+    def quantize_output(self, output, config, **kwargs):
+        """
+        quantize should overload this method to quantize output.
+        This method is effectively hooked to :meth:`forward` of the model.
+        Parameters
+        ----------
+        output : Tensor
+            output that needs to be quantized
+        config : dict
+            the configuration for output quantization
+        """
+        # Put your code to generate `new_output` here
+        new_output = ...
+        return new_output
+    def quantize_input(self, *inputs, config, **kwargs):
+        """
+        quantize should overload this method to quantize input.
+        This method is effectively hooked to :meth:`forward` of the model.
+        Parameters
+        ----------
+        inputs : Tensor
+            inputs that needs to be quantized
+        config : dict
+            the configuration for inputs quantization
+        """
+        # Put your code to generate `new_input` here
+        new_input = ...
+        return new_input
+    def update_epoch(self, epoch_num):
+        pass
+    def step(self):
+        """
+        Can do some processing based on the model or weights binded
+        in the func bind_model
+        """
+        pass
+# %%
+# Customize backward function
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Sometimes it's necessary for a quantization operation to have a customized backward function,
+# such as `Straight-Through Estimator <https://stackoverflow.com/questions/38361314/the-concept-of-straight-through-estimator-ste>`__\ ,
+# users can customize a backward function as follows:
+from nni.compression.pytorch.compressor import Quantizer, QuantGrad, QuantType
+class ClipGrad(QuantGrad):
+    @staticmethod
+    def quant_backward(tensor, grad_output, quant_type):
+        """
+        This method should be overridden by subclass to provide customized backward function,
+        default implementation is Straight-Through Estimator
+        Parameters
+        ----------
+        tensor : Tensor
+            input of quantization operation
+        grad_output : Tensor
+            gradient of the output of quantization operation
+        quant_type : QuantType
+            the type of quantization, it can be `QuantType.INPUT`, `QuantType.WEIGHT`, `QuantType.OUTPUT`,
+            you can define different behavior for different types.
+        Returns
+        -------
+        tensor
+            gradient of the input of quantization operation
+        """
+        # for quant_output function, set grad to zero if the absolute value of tensor is larger than 1
+        if quant_type == QuantType.OUTPUT:
+            grad_output[tensor.abs() > 1] = 0
+        return grad_output
+class _YourQuantizer(Quantizer):
+    def __init__(self, model, config_list):
+        super().__init__(model, config_list)
+        # set your customized backward function to overwrite default backward function
+        self.quant_grad = ClipGrad
+# %%
+# If you do not customize ``QuantGrad``, the default backward is Straight-Through Estimator. 
--- a/examples/tutorials/quantization_quick_start_mnist.py
+++ b/examples/tutorials/quantization_quick_start_mnist.py
+"""
+Quantization Quickstart
+=======================
+Quantization reduces model size and speeds up inference time by reducing the number of bits required to represent weights or activations.
+In NNI, both post-training quantization algorithms and quantization-aware training algorithms are supported.
+Here we use `QAT_Quantizer` as an example to show the usage of quantization in NNI.
+"""
+# %%
+# Preparation
+# -----------
+#
+# In this tutorial, we use a simple model and pre-train on MNIST dataset.
+# If you are familiar with defining a model and training in pytorch, you can skip directly to `Quantizing Model`_.
+import torch
+import torch.nn.functional as F
+from torch.optim import SGD
+from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
+# define the model
+model = TorchModel().to(device)
+# define the optimizer and criterion for pre-training
+optimizer = SGD(model.parameters(), 1e-2)
+criterion = F.nll_loss
+# pre-train and evaluate the model on MNIST dataset
+for epoch in range(3):
+    trainer(model, optimizer, criterion)
+    evaluator(model)
+# %%
+# Quantizing Model
+# ----------------
+#
+# Initialize a `config_list`.
+# For details about how to write ``config_list``, please refer to :doc:`compression config specification <../compression/compression_config_list>`.
+config_list = [{
+    'quant_types': ['input', 'weight'],
+    'quant_bits': {'input': 8, 'weight': 8},
+    'op_types': ['Conv2d']
+}, {
+    'quant_types': ['output'],
+    'quant_bits': {'output': 8},
+    'op_types': ['ReLU']
+}, {
+    'quant_types': ['input', 'weight'],
+    'quant_bits': {'input': 8, 'weight': 8},
+    'op_names': ['fc1', 'fc2']
+}]
+# %%
+# finetuning the model by using QAT
+from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
+dummy_input = torch.rand(32, 1, 28, 28).to(device)
+quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
+quantizer.compress()
+# %%
+# The model has now been wrapped, and quantization targets ('quant_types' setting in `config_list`)
+# will be quantized & dequantized for simulated quantization in the wrapped layers.
+# QAT is a training-aware quantizer, it will update scale and zero point during training.
+for epoch in range(3):
+    trainer(model, optimizer, criterion)
+    evaluator(model)
+# %%
+# export model and get calibration_config
+model_path = "./log/mnist_model.pth"
+calibration_path = "./log/mnist_calibration.pth"
+calibration_config = quantizer.export_model(model_path, calibration_path)
+print("calibration_config: ", calibration_config)
+# %%
+# build tensorRT engine to make a real speedup, for more information about speedup, please refer to :doc:`quantization_speedup`.
+from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
+input_shape = (32, 1, 28, 28)
+engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
+engine.compress()
+test_trt(engine)
--- a/examples/tutorials/quantization_speedup.py
+++ b/examples/tutorials/quantization_speedup.py
+"""
+SpeedUp Model with Calibration Config
+======================================
+Introduction
+------------
+Deep learning networks are computationally intensive and memory intensive,
+which increases the difficulty of deploying deep neural network models. Quantization is a
+fundamental technology which is widely used to reduce memory footprint and speed up the inference
+process. Many frameworks have begun to support quantization, but few of them support mixed precision
+quantization and get real speedup. Frameworks like `HAQ: Hardware-Aware Automated Quantization with Mixed Precision <https://arxiv.org/pdf/1811.08886.pdf>`__\, only support simulated mixed precision quantization which will
+not speed up the inference process. To get real speedup of mixed precision quantization and
+help people get real feedback from hardware, we design a general framework with a simple interface to allow NNI quantization algorithms to connect to different
+DL model optimization backends (e.g., TensorRT, NNFusion), which gives users an end-to-end experience: after quantizing their model
+with quantization algorithms, the quantized model can be directly sped up with the connected optimization backend. NNI connects to
+TensorRT at this stage, and will support more backends in the future.
+Design and Implementation
+-------------------------
+To support speeding up mixed precision quantization, we divide the framework into two parts, frontend and backend.
+The frontend could be a popular training framework such as PyTorch, TensorFlow, etc. The backend could be an inference
+framework for different hardware, such as TensorRT. At present, we support PyTorch as the frontend and
+TensorRT as the backend. To convert a PyTorch model to a TensorRT engine, we leverage onnx as the intermediate graph
+representation. In this way, we convert the PyTorch model to an onnx model, then TensorRT parses the onnx
+model to generate the inference engine.
+Quantization aware training combines NNI quantization algorithm 'QAT' and NNI quantization speedup tool.
+Users should set config to train quantized model using QAT algorithm(please refer to :doc:`NNI Quantization Algorithms <../compression/quantizer>`  ).
+After quantization aware training, users can get new config with calibration parameters and model with quantized weight. By passing new config and model to quantization speedup tool, users can get real mixed precision speedup engine to do inference.
+After getting mixed precision engine, users can do inference with input data.
+Note
+* Recommend using "cpu"(host) as data device(for both inference data and calibration data) since data should be on host initially and it will be transferred to device before inference. If the data device is not "cpu"(host), this tool will transfer it to "cpu", which may add unnecessary overhead.
+* User can also do post-training quantization leveraging TensorRT directly(need to provide calibration dataset).
+* Not all op types are supported right now. At present, NNI supports Conv, Linear, Relu and MaxPool. More op types will be supported in the following release.
+Prerequisite
+------------
+CUDA version >= 11.0
+TensorRT version >= 7.2
+Note
+* If you haven't installed TensorRT before or are using an old version, please refer to the `TensorRT Installation Guide <https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html>`__.
+Usage
+-----
+"""
+# %%
+import torch
+import torch.nn.functional as F
+from torch.optim import SGD
+from scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt
+# Quantization configuration: one dict per group of layers to quantize.
+config_list = [{
+    # Conv2d ops: quantize input and weight to 8 bits.
+    'quant_types': ['input', 'weight'],
+    'quant_bits': {'input': 8, 'weight': 8},
+    'op_types': ['Conv2d']
+}, {
+    # ReLU ops: quantize output to 8 bits.
+    'quant_types': ['output'],
+    'quant_bits': {'output': 8},
+    'op_types': ['ReLU']
+}, {
+    # Modules named 'fc1' and 'fc2': quantize input and weight to 8 bits.
+    'quant_types': ['input', 'weight'],
+    'quant_bits': {'input': 8, 'weight': 8},
+    'op_names': ['fc1', 'fc2']
+}]
+model = TorchModel().to(device)
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
+criterion = F.nll_loss
+# Random batch placed on the same device as the model; passed to the quantizer as dummy_input.
+dummy_input = torch.rand(32, 1, 28, 28).to(device)
+from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
+quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
+quantizer.compress()
+# %%
+# finetuning the model by using QAT
+for epoch in range(3):
+    # each epoch: one training pass, then print test loss/accuracy
+    trainer(model, optimizer, criterion)
+    evaluator(model)
+# %%
+# export model and get calibration_config
+import os
+# make sure the output directory exists before exporting into it
+os.makedirs('log', exist_ok=True)
+model_path = "./log/mnist_model.pth"
+calibration_path = "./log/mnist_calibration.pth"
+calibration_config = quantizer.export_model(model_path, calibration_path)
+print("calibration_config: ", calibration_config)
+# %%
+# build tensorRT engine to make a real speedup
+from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
+# same shape as the dummy_input used for QAT; batchsize below equals input_shape[0]
+input_shape = (32, 1, 28, 28)
+engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
+engine.compress()
+# run the engine over the test set and print loss, accuracy and total inference time
+test_trt(engine)
+# %%
+# Note that NNI also supports post-training quantization directly, please refer to complete examples for detail.
+#
+# For complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.
+#
+# For more parameters about the class 'ModelSpeedupTensorRT', you can refer to :doc:`Model Compression API Reference <../reference/compression/quantization_speedup>`.
+#
+# Mnist test
+# ^^^^^^^^^^
+#
+# on one GTX2080 GPU,
+# input tensor: ``torch.randn(128, 1, 28, 28)``
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - quantization strategy
+#      - Latency
+#      - accuracy
+#    * - all in 32bit
+#      - 0.001199961
+#      - 96%
+#    * - mixed precision(average bit 20.4)
+#      - 0.000753688
+#      - 96%
+#    * - all in 8bit
+#      - 0.000229869
+#      - 93.7%
+#
+# Cifar10 resnet18 test (train one epoch)
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# on one GTX2080 GPU,
+# input tensor: ``torch.randn(128, 3, 32, 32)``
+#
+# .. list-table::
+#    :header-rows: 1
+#    :widths: auto
+#
+#    * - quantization strategy
+#      - Latency
+#      - accuracy
+#    * - all in 32bit
+#      - 0.003286268
+#      - 54.21%
+#    * - mixed precision(average bit 11.55)
+#      - 0.001358022
+#      - 54.78%
+#    * - all in 8bit
+#      - 0.000859139
+#      - 52.81%
--- a/examples/tutorials/scripts/compression_mnist_model.py
+++ b/examples/tutorials/scripts/compression_mnist_model.py
+from pathlib import Path
+root_path = Path(__file__).parent.parent
+# define the model
+import torch
+from torch import nn
+from torch.nn import functional as F
+class TorchModel(nn.Module):
+    """Small convolutional classifier for 1x28x28 inputs with 10 output classes.
+
+    Two conv -> ReLU -> max-pool stages are followed by three linear layers;
+    ``forward`` returns log-probabilities over the 10 classes.
+    """
+    def __init__(self):
+        super().__init__()
+        # Sub-modules are registered in the same order as before so that
+        # parameter initialisation and ``named_modules()`` order are unchanged.
+        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
+        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
+        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
+        self.fc2 = nn.Linear(in_features=120, out_features=84)
+        self.fc3 = nn.Linear(in_features=84, out_features=10)
+        self.relu1 = nn.ReLU()
+        self.relu2 = nn.ReLU()
+        self.relu3 = nn.ReLU()
+        self.relu4 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))
+        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
+    def forward(self, x):
+        """Return per-class log-probabilities (``log_softmax`` over dim 1) for batch ``x``."""
+        # first conv stage
+        stage1 = self.conv1(x)
+        stage1 = self.relu1(stage1)
+        stage1 = self.pool1(stage1)
+        # second conv stage
+        stage2 = self.conv2(stage1)
+        stage2 = self.relu2(stage2)
+        stage2 = self.pool2(stage2)
+        # flatten everything except the batch dimension
+        features = torch.flatten(stage2, start_dim=1)
+        hidden = self.relu3(self.fc1(features))
+        hidden = self.relu4(self.fc2(hidden))
+        logits = self.fc3(hidden)
+        return F.log_softmax(logits, dim=1)
+use_cuda = torch.cuda.is_available()
+device = torch.device("cuda" if use_cuda else "cpu")
+# load data
+from torchvision import datasets, transforms
+# Training set loader: batches of 128, reshuffled on each pass. ``download=True``
+# fetches MNIST into ``root_path / 'data'`` when it is not already there.
+train_loader = torch.utils.data.DataLoader(
+    datasets.MNIST(root_path / 'data', train=True, download=True, transform=transforms.Compose([
+        transforms.ToTensor(),
+        # NOTE(review): presumably the MNIST mean/std - confirm
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])), batch_size=128, shuffle=True)
+# Test set loader: batches of 1000, same transforms as the training loader.
+# No ``download`` flag is passed here.
+test_loader = torch.utils.data.DataLoader(
+    datasets.MNIST(root_path / 'data', train=False, transform=transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])), batch_size=1000, shuffle=True)
+# define the trainer and evaluator
+def trainer(model, optimizer, criterion):
+    """Run one pass over ``train_loader``, stepping ``optimizer`` after every batch.
+
+    ``criterion`` is called as ``criterion(output, target)`` and must return a
+    value supporting ``.backward()``.
+    """
+    model.train()
+    for inputs, labels in train_loader:
+        inputs = inputs.to(device)
+        labels = labels.to(device)
+        optimizer.zero_grad()
+        batch_loss = criterion(model(inputs), labels)
+        batch_loss.backward()
+        optimizer.step()
+def evaluator(model):
+    """Evaluate ``model`` on ``test_loader`` and print the average loss and accuracy.
+
+    The loss is the summed NLL loss divided by the number of test samples.
+    Nothing is returned; the result is only printed.
+    """
+    model.eval()
+    loss_sum = 0
+    num_correct = 0
+    num_samples = len(test_loader.dataset)
+    with torch.no_grad():
+        for inputs, labels in test_loader:
+            inputs, labels = inputs.to(device), labels.to(device)
+            log_probs = model(inputs)
+            # accumulate the un-averaged batch loss
+            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
+            # predicted class = index of the largest log-probability
+            predicted = log_probs.argmax(dim=1, keepdim=True)
+            num_correct += predicted.eq(labels.view_as(predicted)).sum().item()
+    mean_loss = loss_sum / num_samples
+    accuracy = 100. * num_correct / num_samples
+    print('Average test loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(mean_loss, num_correct, num_samples, accuracy))
+def test_trt(engine):
+    """Run ``engine.inference`` over ``test_loader`` and print loss, accuracy and total time.
+
+    ``engine.inference(data)`` is expected to return an ``(output, elapsed)``
+    pair; the elapsed values are summed across batches.
+    """
+    dataset_size = len(test_loader.dataset)
+    loss_sum = 0
+    num_correct = 0
+    total_time = 0
+    for inputs, labels in test_loader:
+        log_probs, batch_time = engine.inference(inputs)
+        loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
+        predicted = log_probs.argmax(dim=1, keepdim=True)
+        num_correct += predicted.eq(labels.view_as(predicted)).sum().item()
+        total_time += batch_time
+    mean_loss = loss_sum / dataset_size
+    print('Loss: {}  Accuracy: {}%'.format(
+        mean_loss, 100 * num_correct / dataset_size))
+    print("Inference elapsed_time (whole dataset): {}s".format(total_time))
--- a/nni/__init__.py
+++ b/nni/__init__.py
@@ -10,6 +10,7 @@ from .runtime.log import init_logger
 init_logger()
 from .common.serializer import trace, dump, load
+from .experiment import Experiment
 from .runtime.env_vars import dispatcher_env_vars
 from .utils import ClassArgsValidator
@@ -19,7 +20,7 @@ if dispatcher_env_vars.SDK_PROCESS != 'dispatcher':
    from .common.nas_utils import training_update
 class NoMoreTrialError(Exception):
+    """Exception carrying a message in ``errorinfo``.
+
+    Parameters
+    ----------
+    ErrorInfo
+        Message describing the error; defaults to 'Search space fully explored'.
+    """
-    def __init__(self, ErrorInfo):
+    def __init__(self, ErrorInfo='Search space fully explored'):
-        super().__init__(self)
+        # Pass the message, not the exception itself: with ``args == (self,)``
+        # ``str(exc)`` would call ``str()`` on the exception again and recurse.
+        super().__init__(ErrorInfo)
        self.errorinfo = ErrorInfo

--- a/nni/algorithms/compression/pytorch/auto_compress/experiment.py
+++ b/nni/algorithms/compression/pytorch/auto_compress/experiment.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
+from __future__ import annotations
 import inspect
 from pathlib import Path, PurePath
-from typing import overload, Union, List
 from nni.experiment import Experiment, ExperimentConfig
 from nni.algorithms.compression.pytorch.auto_compress.interface import AbstractAutoCompressionModule
@@ -11,49 +12,19 @@ from nni.algorithms.compression.pytorch.auto_compress.interface import AbstractA
 class AutoCompressionExperiment(Experiment):
-    @overload
+    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config_or_platform: ExperimentConfig | str | list[str]) -> None:
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config: ExperimentConfig) -> None:
-        """
-        Prepare an experiment.
-        Use `Experiment.run()` to launch it.
-        Parameters
-        ----------
-        auto_compress_module
-            The module provided by the user implements the `AbstractAutoCompressionModule` interfaces.
-            Remember put the module file under `trial_code_directory`.
-        config
-            Experiment configuration.
-        """
-        ...
-    @overload
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, training_service: Union[str, List[str]]) -> None:
        """
-        Prepare an experiment, leaving configuration fields to be set later.
+        Prepare an auto compression experiment.
-        Example usage::
-            experiment = Experiment(auto_compress_module, 'remote')
-            experiment.config.trial_command = 'python3 trial.py'
-            experiment.config.machines.append(RemoteMachineConfig(ip=..., user_name=...))
-            ...
-            experiment.run(8080)
        Parameters
        ----------
        auto_compress_module
            The module provided by the user implements the `AbstractAutoCompressionModule` interfaces.
            Remember put the module file under `trial_code_directory`.
-        training_service
+        config_or_platform
-            Name of training service.
+            Experiment configuration or training service name.
-            Supported value: "local", "remote", "openpai", "aml", "kubeflow", "frameworkcontroller", "adl" and hybrid training service.
        """
-        ...
+        super().__init__(config_or_platform)
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config=None, training_service=None):
-        super().__init__(config, training_service)
        self.module_file_path = str(PurePath(inspect.getfile(auto_compress_module)))
        self.module_name = auto_compress_module.__name__

--- a/nni/algorithms/compression/pytorch/pruning/auto_compress_pruner.py
+++ b/nni/algorithms/compression/pytorch/pruning/auto_compress_pruner.py
@@ -201,7 +201,7 @@ class AutoCompressPruner(Pruner):
            ADMMpruner.export_model(os.path.join(self._experiment_data_dir, 'model_admm_masked.pth'), os.path.join(
                self._experiment_data_dir, 'mask.pth'))
-            # use speed up to prune the model before next iteration,
+            # use speedup to prune the model before next iteration,
            # because SimulatedAnnealingPruner & ADMMPruner don't take masked models
            self._model_to_prune.load_state_dict(torch.load(os.path.join(
                self._experiment_data_dir, 'model_admm_masked.pth')))

--- a/nni/algorithms/compression/pytorch/pruning/dependency_aware_pruner.py
+++ b/nni/algorithms/compression/pytorch/pruning/dependency_aware_pruner.py
@@ -35,7 +35,7 @@ class DependencyAwarePruner(Pruner):
        if self.dependency_aware:
            if not self._supported_dependency_aware():
-                raise ValueError('This pruner does not support dependency aware!')
+                raise ValueError('This pruner does not support dependency-aware!')
            errmsg = "When dependency_aware is set, the dummy_input should not be None"
            assert self.dummy_input is not None, errmsg

--- a/nni/algorithms/compression/pytorch/pruning/sensitivity_pruner.py
+++ b/nni/algorithms/compression/pytorch/pruning/sensitivity_pruner.py
@@ -10,7 +10,7 @@ import torch
 from schema import And, Optional
 from nni.compression.pytorch.compressor import Pruner
 from nni.compression.pytorch.utils.config_validation import PrunerSchema
-from nni.compression.pytorch.utils.sensitivity_analysis import SensitivityAnalysis
+from nni.compression.pytorch.utils import SensitivityAnalysis
 from .constants_pruner import PRUNER_DICT

--- a/nni/algorithms/compression/pytorch/quantization/bnn_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/bnn_quantizer.py
@@ -22,9 +22,74 @@ class ClipGrad(QuantGrad):
 class BNNQuantizer(Quantizer):
-    """Binarized Neural Networks, as defined in:
+    r"""
-    Binarized Neural Networks: Training Deep Neural Networks with Weights and Outputs Constrained to +1 or -1
+    Binarized Neural Networks, as defined in:
-    (https://arxiv.org/abs/1602.02830)
+    `Binarized Neural Networks: Training Deep Neural Networks with Weights and
+    Activations Constrained to +1 or -1 <https://arxiv.org/abs/1602.02830>`__,
+    ..
+        We introduce a method to train Binarized Neural Networks (BNNs) - neural networks with binary weights and activations at run-time.
+        At training-time the binary weights and activations are used for computing the parameters gradients. During the forward pass,
+        BNNs drastically reduce memory size and accesses, and replace most arithmetic operations with bit-wise operations,
+        which is expected to substantially improve power-efficiency.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys for dict:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                When the type is int, all quantization types share same bits length.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    optimizer : torch.optim.Optimizer
+        Optimizer is required in `BNNQuantizer`, NNI will patch the optimizer and count the optimize step number.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import BNNQuantizer
+        >>> model = ...
+        >>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
+        >>> optimizer = ...
+        >>> quantizer = BNNQuantizer(model, config_list, optimizer)
+        >>> quantizer.compress()
+        >>> # Training Process...
+    For detailed example please refer to
+    :githublink:`examples/model_compress/quantization/BNN_quantizer_cifar10.py
+    <examples/model_compress/quantization/BNN_quantizer_cifar10.py>`.
+    Notes
+    -----
+    **Results**
+    We implemented one of the experiments in
+    `Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1
+    <https://arxiv.org/abs/1602.02830>`__,
+    we quantized the **VGGNet** for CIFAR-10 in the paper. Our experiments results are as follows:
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+        * - Model
+            - Accuracy
+        * - VGGNet
+            - 86.93%
+    The experiments code can be found at
+    :githublink:`examples/model_compress/quantization/BNN_quantizer_cifar10.py
+    <examples/model_compress/quantization/BNN_quantizer_cifar10.py>`
    """
    def __init__(self, model, config_list, optimizer):

--- a/nni/algorithms/compression/pytorch/quantization/dorefa_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/dorefa_quantizer.py
@@ -13,9 +13,45 @@ logger = logging.getLogger(__name__)
 class DoReFaQuantizer(Quantizer):
-    """Quantizer using the DoReFa scheme, as defined in:
+    r"""
-    Zhou et al., DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients
+    Quantizer using the DoReFa scheme, as defined in:
-    (https://arxiv.org/abs/1606.06160)
+    `DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients <https://arxiv.org/abs/1606.06160>`__,
+    authors Shuchang Zhou and Yuxin Wu provide an algorithm named DoReFa to quantize the weight, activation and gradients with training.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys for dict:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                When the type is int, all quantization types share same bits length.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    optimizer : torch.optim.Optimizer
+        Optimizer is required in `DoReFaQuantizer`, NNI will patch the optimizer and count the optimize step number.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import DoReFaQuantizer
+        >>> model = ...
+        >>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
+        >>> optimizer = ...
+        >>> quantizer = DoReFaQuantizer(model, config_list, optimizer)
+        >>> quantizer.compress()
+        >>> # Training Process...
+    For detailed example please refer to
+    :githublink:`examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py
+    <examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py>`.
    """
    def __init__(self, model, config_list, optimizer):

--- a/nni/algorithms/compression/pytorch/quantization/lsq_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/lsq_quantizer.py
@@ -11,35 +11,56 @@ logger = logging.getLogger(__name__)
 class LsqQuantizer(Quantizer):
-    """Quantizer defined in:
+    r"""
-       Learned Step Size Quantization (ICLR 2020)
+    Quantizer defined in: `LEARNED STEP SIZE QUANTIZATION <https://arxiv.org/pdf/1902.08153.pdf>`__,
-       https://arxiv.org/pdf/1902.08153.pdf
+    authors Steven K. Esser and Jeffrey L. McKinstry provide an algorithm to train the scales with gradients.
+    ..
+        The authors introduce a novel means to estimate and scale the task loss gradient at each weight and activation
+        layer's quantizer step size, such that it can be learned in conjunction with other network parameters.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys for dict:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                When the type is int, all quantization types share same bits length.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    optimizer : torch.optim.Optimizer
+        Optimizer is required in `LsqQuantizer`, NNI will patch the optimizer and count the optimize step number.
+    dummy_input : Tuple[torch.Tensor]
+        Inputs to the model, which are used to get the graph of the module. The graph is used to find Conv-Bn patterns.
+        And then the batch normalization folding would be enabled. If dummy_input is not given,
+        the batch normalization folding would be disabled.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import LsqQuantizer
+        >>> model = ...
+        >>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
+        >>> optimizer = ...
+        >>> dummy_input = torch.rand(...)
+        >>> quantizer = LsqQuantizer(model, config_list, optimizer, dummy_input=dummy_input)
+        >>> quantizer.compress()
+        >>> # Training Process...
+    For detailed example please refer to
+    :githublink:`examples/model_compress/quantization/LSQ_torch_quantizer.py <examples/model_compress/quantization/LSQ_torch_quantizer.py>`.
    """
    def __init__(self, model, config_list, optimizer, dummy_input=None):
-        """
-        Parameters
-        ----------
-        model : torch.nn.Module
-            the model to be quantized
-        config_list : list of dict
-            list of configurations for quantization
-            supported keys for dict:
-                - quant_types : list of string
-                    type of quantization you want to apply, currently support 'weight', 'input', 'output'
-                - quant_bits : int or dict of {str : int}
-                    bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
-                    when the type is int, all quantization types share same bits length
-                - quant_start_step : int
-                    disable quantization until model are run by certain number of steps, this allows the network to enter a more stable
-                    state where output quantization ranges do not exclude a signiﬁcant fraction of values, default value is 0
-                - op_types : list of string
-                    types of nn.module you want to apply quantization, eg. 'Conv2d'
-                - dummy_input : tuple of tensor
-                    inputs to the model, which are used to get the graph of the module. The graph is used to find
-                    Conv-Bn patterns. And then the batch normalization folding would be enabled. If dummy_input is not
-                    given, the batch normalization folding would be disabled.
-        """
        assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
        super().__init__(model, config_list, optimizer, dummy_input)
        device = next(model.parameters()).device

--- a/nni/algorithms/compression/pytorch/quantization/native_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/native_quantizer.py
@@ -12,7 +12,32 @@ logger = logging.getLogger(__name__)
 class NaiveQuantizer(Quantizer):
-    """quantize weight to 8 bits
+    r"""
+    Quantize weight to 8 bits directly.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                when the type is int, all quantization types share same bits length.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import NaiveQuantizer
+        >>> model = ...
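+        >>> config_list = [{'quant_types': ['weight'], 'quant_bits': {'weight': 8}, 'op_types': ['Conv2d']}]
+        >>> NaiveQuantizer(model, config_list).compress()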
    """
    def __init__(self, model, config_list, optimizer=None):

--- a/nni/algorithms/compression/pytorch/quantization/observer_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/observer_quantizer.py
@@ -14,7 +14,12 @@ logger = logging.getLogger(__name__)
 class ObserverQuantizer(Quantizer):
-    """This quantizer uses observers to record weight/output statistics to get quantization information.
+    r"""
+    Observer quantizer is a framework of post-training quantization.
+    It will insert observers into the place where the quantization will happen.
+    During quantization calibration, each observer will record all the tensors it 'sees'.
+    These tensors will be used to calculate the quantization statistics after calibration.
    The whole process can be divided into three steps:
    1. It will register observers to the place where quantization would happen (just like registering hooks).
@@ -23,6 +28,66 @@ class ObserverQuantizer(Quantizer):
    Note that the observer type, tensor dtype and quantization qscheme are hard coded for now. Their customization
    are under development and will be ready soon.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                when the type is int, all quantization types share same bits length.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    optimizer : torch.optim.Optimizer
+        Optimizer is optional in `ObserverQuantizer`.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import ObserverQuantizer
+        >>> model = ...
+        >>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
+        >>> quantizer = ObserverQuantizer(model, config_list)
+        >>> # define a calibration function
+        >>> def calibration(model, calib_loader):
+        ...     model.eval()
+        ...     with torch.no_grad():
+        ...         for data, _ in calib_loader:
+        ...             model(data)
+        >>> calibration(model, calib_loader)
+        >>> quantizer.compress()
+    For detailed example please refer to
+    :githublink:`examples/model_compress/quantization/observer_quantizer.py <examples/model_compress/quantization/observer_quantizer.py>`.
+    .. note::
+        This quantizer is still under development for now. Some quantizer settings are hard-coded:
+        - weight observer: per_tensor_symmetric, qint8
+        - output observer: per_tensor_affine, quint8, reduce_range=True
+        Other settings (such as quant_type and op_names) can be configured.
+    Notes
+    -----
+    **About the compress API**
+    Before the `compress` API is called, the model will only record tensors' statistics and no quantization process will be executed.
+    After the `compress` API is called, the model will NOT record tensors' statistics any more. The quantization scale and zero point will
+    be generated for each tensor and will be used to quantize each tensor during inference (we call it evaluation mode).
+    **About calibration**
+    Usually we pick about 100 training/evaluation examples for calibration. If you find the accuracy is a bit low, try
+    to reduce the number of calibration examples.
    """
    def __init__(self, model, config_list, optimizer=None):

--- a/nni/algorithms/compression/pytorch/quantization/qat_quantizer.py
+++ b/nni/algorithms/compression/pytorch/quantization/qat_quantizer.py
@@ -107,36 +107,151 @@ def update_ema(biased_ema, value, decay):
 class QAT_Quantizer(Quantizer):
-    """Quantizer defined in:
+    r"""
-    Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference
+    Quantizer defined in:
-    http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf
+    `Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference
+    <http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf>`__
+    Authors Benoit Jacob and Skirmantas Kligys provide an algorithm to quantize the model with training.
+    ..
+        We propose an approach that simulates quantization effects in the forward pass of training.
+        Backpropagation still happens as usual, and all weights and biases are stored in floating point
+        so that they can be easily nudged by small amounts.
+        The forward propagation pass however simulates quantized inference as it will happen in the inference engine,
+        by implementing in floating-point arithmetic the rounding behavior of the quantization scheme:
+        * Weights are quantized before they are convolved with the input. If batch normalization (see [17]) is used for the layer,
+          the batch normalization parameters are “folded into” the weights before quantization.
+        * Activations are quantized at points where they would be during inference,
+          e.g. after the activation function is applied to a convolutional or fully connected layer’s output,
+          or after a bypass connection adds or concatenates the outputs of several layers together such as in ResNets.
+    Parameters
+    ----------
+    model : torch.nn.Module
+        Model to be quantized.
+    config_list : List[Dict]
+        List of configurations for quantization. Supported keys for dict:
+            - quant_types : List[str]
+                Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
+            - quant_bits : Union[int, Dict[str, int]]
+                Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
+                When the type is int, all quantization types share same bits length.
+            - quant_start_step : int
+                Disable quantization until the model has run for a certain number of steps. This allows the network to enter a more stable
+                state where output quantization ranges do not exclude a significant fraction of values. Default value is 0.
+            - op_types : List[str]
+                Types of nn.module you want to apply quantization, eg. 'Conv2d'.
+            - op_names : List[str]
+                Names of nn.module you want to apply quantization, eg. 'conv1'.
+            - exclude : bool
+                Set True then the layers setting by op_types and op_names will be excluded from quantization.
+    optimizer : torch.optim.Optimizer
+        Optimizer is required in `QAT_Quantizer`, NNI will patch the optimizer and count the optimize step number.
+    dummy_input : Tuple[torch.Tensor]
+        Inputs to the model, which are used to get the graph of the module. The graph is used to find Conv-Bn patterns.
+        And then the batch normalization folding would be enabled. If dummy_input is not given,
+        the batch normalization folding would be disabled.
+    Examples
+    --------
+        >>> from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
+        >>> model = ...
+        >>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
+        >>> optimizer = ...
+        >>> dummy_input = torch.rand(...)
+        >>> quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input=dummy_input)
+        >>> quantizer.compress()
+        >>> # Training Process...
+    For detailed example please refer to
+    :githublink:`examples/model_compress/quantization/QAT_torch_quantizer.py <examples/model_compress/quantization/QAT_torch_quantizer.py>`.
+    Notes
+    -----
+    **Batch normalization folding**
+    Batch normalization folding is supported in QAT quantizer. It can be easily enabled by passing an argument `dummy_input` to
+    the quantizer, like:
+    .. code-block:: python
+        # assume your model takes an input of shape (1, 1, 28, 28)
+        # and dummy_input must be on the same device as the model
+        dummy_input = torch.randn(1, 1, 28, 28)
+        # pass the dummy_input to the quantizer
+        quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input=dummy_input)
+    The quantizer will automatically detect Conv-BN patterns and simulate batch normalization folding process in the training
+    graph. Note that when the quantization aware training process is finished, the folded weight/bias would be restored after calling
+    `quantizer.export_model`.
+    **Quantization dtype and scheme customization**
+    Different backends on different devices use different quantization strategies (i.e. dtype (int or uint) and
+    scheme (per-tensor or per-channel and symmetric or affine)). QAT quantizer supports customization of mainstream dtypes and schemes.
+    There are two ways to set them. One way is setting them globally through a function named `set_quant_scheme_dtype` like:
+    .. code-block:: python
+        from nni.compression.pytorch.quantization.settings import set_quant_scheme_dtype
+        # This will set all the quantization of 'input' in 'per_tensor_affine' and 'uint' manner
+        set_quant_scheme_dtype('input', 'per_tensor_affine', 'uint')
+        # This will set all the quantization of 'output' in 'per_tensor_symmetric' and 'int' manner
+        set_quant_scheme_dtype('output', 'per_tensor_symmetric', 'int')
+        # This will set all the quantization of 'weight' in 'per_channel_symmetric' and 'int' manner
+        set_quant_scheme_dtype('weight', 'per_channel_symmetric', 'int')
+    The other way is more detailed. You can customize the dtype and scheme in each quantization config list like:
+    .. code-block:: python
+        config_list = [{
+            'quant_types': ['weight'],
+            'quant_bits':  8,
+            'op_types':['Conv2d', 'Linear'],
+            'quant_dtype': 'int',
+            'quant_scheme': 'per_channel_symmetric'
+        }, {
+            'quant_types': ['output'],
+            'quant_bits': 8,
+            'quant_start_step': 7000,
+            'op_types':['ReLU6'],
+            'quant_dtype': 'uint',
+            'quant_scheme': 'per_tensor_affine'
+        }]
+    **Multi-GPU training**
+    QAT quantizer natively supports multi-gpu training (DataParallel and DistributedDataParallel). Note that the quantizer
+    instantiation should happen before you wrap your model with DataParallel or DistributedDataParallel. For example:
+    .. code-block:: python
+        from torch.nn.parallel import DistributedDataParallel as DDP
+        from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
+        model = define_your_model()
+        quantizer = QAT_Quantizer(model, **other_params)  # <--- QAT_Quantizer instantiation (wraps `model` in place)
+        model = DDP(model)
+        for i in range(epochs):
+            train(model)
+            eval(model)
    """
    def __init__(self, model, config_list, optimizer, dummy_input=None):
-        """
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to quantize
-        config_list : list of dict
-            list of configurations for quantization
-            supported keys for dict:
-                - quant_types : list of string
-                    type of quantization you want to apply, currently support 'weight', 'input', 'output'
-                - quant_bits : int or dict of {str : int}
-                    bits length of quantization, key is the quantization type, value is the length, eg. {'weight', 8},
-                    when the type is int, all quantization types share same bits length
-                - quant_start_step : int
-                    disable quantization until model are run by certain number of steps, this allows the network to enter a more stable
-                    state where output quantization ranges do not exclude a signiﬁcant fraction of values, default value is 0
-                - op_types : list of string
-                    types of nn.module you want to apply quantization, eg. 'Conv2d'
-                - dummy_input : tuple of tensor
-                    inputs to the model, which are used to get the graph of the module. The graph is used to find
-                    Conv-Bn patterns. And then the batch normalization folding would be enabled. If dummy_input is not
-                    given, the batch normalization folding would be disabled.
-        """
        assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
        super().__init__(model, config_list, optimizer, dummy_input)
        self.quant_grad = QATGrad.apply

--- a/nni/algorithms/compression/tensorflow/pruning/__init__.py
+++ b/nni/algorithms/compression/tensorflow/pruning/__init__.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
 from .one_shot_pruner import *