update CN model compression overview & quick start doc (#4445)

f1e70073 · J-shang · GitHub · 7bc1105f · f1e70073 · f1e70073
Unverified Commit f1e70073 authored Jan 12, 2022 by J-shang Committed by GitHub Jan 12, 2022
5 changed files
--- a/docs/en_US/Compression/QuickStart.rst
+++ b/docs/en_US/Compression/QuickStart.rst
@@ -28,7 +28,7 @@ Write a configuration to specify the layers that you want to prune. The followin
       'op_types': ['default'],
   }]

-The specification of configuration can be found `here <./Tutorial.rst#specify-the-configuration>`__. Note that different pruners may have their own defined fields in configuration, for exmaple ``start_epoch`` in AGP pruner. Please refer to each pruner's `usage <./Pruner.rst>`__ for details, and adjust the configuration accordingly.
+The specification of configuration can be found `here <./Tutorial.rst#specify-the-configuration>`__. Note that different pruners may have their own defined fields in configuration. Please refer to each pruner's `usage <./Pruner.rst>`__ for details, and adjust the configuration accordingly.

 Step2. Choose a pruner and compress the model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -80,9 +80,10 @@ Step1. Write configuration
 .. code-block:: python

   config_list = [{
-       'quant_types': ['weight'],
+       'quant_types': ['weight', 'input'],
       'quant_bits': {
           'weight': 8,
+           'input': 8,
       }, # you can just use an `int` here because all `quant_types` share the same bit length, see the config for `ReLU6` below.
       'op_types':['Conv2d', 'Linear'],
       'quant_dtype': 'int',

--- a/docs/en_US/Compression/compression_pipeline_example.ipynb
+++ b/docs/en_US/Compression/compression_pipeline_example.ipynb
@@ -2,14 +2,16 @@
 "cells": [
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 1. Prepare model"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
@@ -36,13 +38,13 @@
    "        x = self.relu3(self.fc1(x))\n",
    "        x = self.fc2(x)\n",
    "        return F.log_softmax(x, dim=1)"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# define model, optimizer, criterion, data_loader, trainer, evaluator.\n",
    "\n",
@@ -97,27 +99,16 @@
    "        test_loss, correct, len(test_loader.dataset), acc))\n",
    "\n",
    "    return acc"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
-   "source": [
-    "# pre-train model for 3 epoches.\n",
-    "\n",
-    "scheduler = StepLR(optimizer, step_size=1, gamma=0.7)\n",
-    "\n",
-    "for epoch in range(0, 3):\n",
-    "    trainer(model, optimizer, criterion, epoch)\n",
-    "    evaluator(model)\n",
-    "    scheduler.step()"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "Train Epoch: 0 [0/60000 (0%)]\tLoss: 2.313423\n",
      "Train Epoch: 0 [6400/60000 (11%)]\tLoss: 0.091786\n",
@@ -161,20 +152,25 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# pre-train the model for 3 epochs.\n",
+    "\n",
+    "scheduler = StepLR(optimizer, step_size=1, gamma=0.7)\n",
+    "\n",
+    "for epoch in range(0, 3):\n",
+    "    trainer(model, optimizer, criterion, epoch)\n",
+    "    evaluator(model)\n",
+    "    scheduler.step()"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
-   "source": [
-    "# show all op_name and op_type in the model.\n",
-    "\n",
-    "[print('op_name: {}\\nop_type: {}\\n'.format(name, type(module))) for name, module in model.named_modules()]"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "op_name: \n",
      "op_type: <class '__main__.NaiveModel'>\n",
@@ -209,49 +205,49 @@
     ]
    },
    {
-     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[None, None, None, None, None, None, None, None, None, None]"
      ]
     },
+     "execution_count": 4,
     "metadata": {},
-     "execution_count": 4
+     "output_type": "execute_result"
    }
   ],
-   "metadata": {}
+   "source": [
+    "# show all op_name and op_type in the model.\n",
+    "\n",
+    "[print('op_name: {}\\nop_type: {}\\n'.format(name, type(module))) for name, module in model.named_modules()]"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
-   "source": [
-    "# show the weight size of `conv1`.\n",
-    "\n",
-    "print(model.conv1.weight.data.size())"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "torch.Size([20, 1, 5, 5])\n"
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# show the weight size of `conv1`.\n",
+    "\n",
+    "print(model.conv1.weight.data.size())"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
-   "source": [
-    "# show the weight of `conv1`.\n",
-    "\n",
-    "print(model.conv1.weight.data)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "tensor([[[[ 1.5338e-01, -1.1766e-01, -2.6654e-01, -2.9445e-02, -1.4650e-01],\n",
      "          [-1.8796e-01, -2.9882e-01,  6.9725e-02,  2.1561e-01,  6.5688e-02],\n",
@@ -395,18 +391,24 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# show the weight of `conv1`.\n",
+    "\n",
+    "print(model.conv1.weight.data)"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 2. Prepare config_list for pruning"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# we will prune 50% weights in `conv1`.\n",
    "\n",
@@ -415,20 +417,20 @@
    "    'op_types': ['Conv2d'],\n",
    "    'op_names': ['conv1']\n",
    "}]"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 3. Choose a pruner and pruning"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# use l1filter pruner to prune the model\n",
    "\n",
@@ -438,23 +440,16 @@
    "# you need a new optimizer instead of you have used above, because NNI might modify the optimizer.\n",
    "# And of course this modified optimizer can not be used in finetuning.\n",
    "pruner = L1FilterPruner(model, config_list)"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
-   "source": [
-    "# we can find the `conv1` has been wrapped, the origin `conv1` changes to `conv1.module`.\n",
-    "# the weight of conv1 will modify by `weight * mask` in `forward()`. The initial mask is a `ones_like(weight)` tensor.\n",
-    "\n",
-    "[print('op_name: {}\\nop_type: {}\\n'.format(name, type(module))) for name, module in model.named_modules()]"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "op_name: \n",
      "op_type: <class '__main__.NaiveModel'>\n",
@@ -492,29 +487,29 @@
     ]
    },
    {
-     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[None, None, None, None, None, None, None, None, None, None, None]"
      ]
     },
+     "execution_count": 9,
     "metadata": {},
-     "execution_count": 9
+     "output_type": "execute_result"
    }
   ],
-   "metadata": {}
+   "source": [
+    "# we can see that `conv1` has been wrapped; the original `conv1` is now `conv1.module`.\n",
+    "# the weight of `conv1` will be modified by `weight * mask` in `forward()`. The initial mask is a `ones_like(weight)` tensor.\n",
+    "\n",
+    "[print('op_name: {}\\nop_type: {}\\n'.format(name, type(module))) for name, module in model.named_modules()]"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
-   "source": [
-    "# compress the model, the mask will be updated.\n",
-    "\n",
-    "pruner.compress()"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "NaiveModel(\n",
@@ -532,43 +527,44 @@
       ")"
      ]
     },
+     "execution_count": 10,
     "metadata": {},
-     "execution_count": 10
+     "output_type": "execute_result"
    }
   ],
-   "metadata": {}
+   "source": [
+    "# compress the model, the mask will be updated.\n",
+    "\n",
+    "pruner.compress()"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
-   "source": [
-    "# show the mask size of `conv1`\n",
-    "\n",
-    "print(model.conv1.weight_mask.size())"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "torch.Size([20, 1, 5, 5])\n"
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# show the mask size of `conv1`\n",
+    "\n",
+    "print(model.conv1.weight_mask.size())"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
-   "source": [
-    "# show the mask of `conv1`\n",
-    "\n",
-    "print(model.conv1.weight_mask)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "tensor([[[[1., 1., 1., 1., 1.],\n",
      "          [1., 1., 1., 1., 1.],\n",
@@ -711,24 +707,20 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# show the mask of `conv1`\n",
+    "\n",
+    "print(model.conv1.weight_mask)"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
-   "source": [
-    "# use a dummy input to apply the sparsify.\n",
-    "\n",
-    "model(torch.rand(1, 1, 28, 28).to(device))\n",
-    "\n",
-    "# the weights of `conv1` have been sparsified.\n",
-    "\n",
-    "print(model.conv1.module.weight.data)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "tensor([[[[ 1.5338e-01, -1.1766e-01, -2.6654e-01, -2.9445e-02, -1.4650e-01],\n",
      "          [-1.8796e-01, -2.9882e-01,  6.9725e-02,  2.1561e-01,  6.5688e-02],\n",
@@ -872,54 +864,54 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# use a dummy input to apply the sparsify.\n",
+    "\n",
+    "model(torch.rand(1, 1, 28, 28).to(device))\n",
+    "\n",
+    "# the weights of `conv1` have been sparsified.\n",
+    "\n",
+    "print(model.conv1.module.weight.data)"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
-   "source": [
-    "# export the sparsified model state to './pruned_naive_mnist_l1filter.pth'.\n",
-    "# export the mask to './mask_naive_mnist_l1filter.pth'.\n",
-    "\n",
-    "pruner.export_model(model_path='pruned_naive_mnist_l1filter.pth', mask_path='mask_naive_mnist_l1filter.pth')"
-   ],
+   "metadata": {
+    "scrolled": true
+   },
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:26:05] INFO (nni.compression.pytorch.compressor/MainThread) Model state_dict saved to pruned_naive_mnist_l1filter.pth\n",
      "[2021-07-26 22:26:05] INFO (nni.compression.pytorch.compressor/MainThread) Mask dict saved to mask_naive_mnist_l1filter.pth\n"
     ]
    }
   ],
-   "metadata": {
-    "scrolled": true
-   }
+   "source": [
+    "# export the sparsified model state to './pruned_naive_mnist_l1filter.pth'.\n",
+    "# export the mask to './mask_naive_mnist_l1filter.pth'.\n",
+    "\n",
+    "pruner.export_model(model_path='pruned_naive_mnist_l1filter.pth', mask_path='mask_naive_mnist_l1filter.pth')"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 4. Speed Up"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
-   "source": [
-    "# If you use a wrapped model, don't forget to unwrap it.\n",
-    "\n",
-    "pruner._unwrap_model()\n",
-    "\n",
-    "# the model has been unwrapped.\n",
-    "\n",
-    "print(model)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "NaiveModel(\n",
      "  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))\n",
@@ -935,37 +927,40 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# If you use a wrapped model, don't forget to unwrap it.\n",
+    "\n",
+    "pruner._unwrap_model()\n",
+    "\n",
+    "# the model has been unwrapped.\n",
+    "\n",
+    "print(model)"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
-   "source": [
-    "from nni.compression.pytorch import ModelSpeedup\n",
-    "\n",
-    "m_speedup = ModelSpeedup(model, dummy_input=torch.rand(10, 1, 28, 28).to(device), masks_file='mask_naive_mnist_l1filter.pth')\n",
-    "m_speedup.speedup_model()"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stderr",
+     "output_type": "stream",
     "text": [
      "<ipython-input-1-0f2a9eb92f42>:22: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  x = x.view(-1, x.size()[1:].numel())\n"
     ]
    },
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:26:18] INFO (nni.compression.pytorch.speedup.compressor/MainThread) start to speed up the model\n",
      "[2021-07-26 22:26:18] INFO (FixMaskConflict/MainThread) {'conv1': 1, 'conv2': 1}\n"
     ]
    },
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:26:18] INFO (FixMaskConflict/MainThread) dim0 sparsity: 0.500000\n",
      "[2021-07-26 22:26:18] INFO (FixMaskConflict/MainThread) dim1 sparsity: 0.000000\n",
@@ -991,16 +986,16 @@
     ]
    },
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:26:18] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update indirect sparsity for relu3\n",
      "[2021-07-26 22:26:18] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the relu3\n"
     ]
    },
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:26:18] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update indirect sparsity for fc1\n",
      "[2021-07-26 22:26:18] INFO (nni.compression.pytorch.speedup.compressor/MainThread) Update the indirect sparsity for the fc1\n",
@@ -1037,21 +1032,21 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "from nni.compression.pytorch import ModelSpeedup\n",
+    "\n",
+    "m_speedup = ModelSpeedup(model, dummy_input=torch.rand(10, 1, 28, 28).to(device), masks_file='mask_naive_mnist_l1filter.pth')\n",
+    "m_speedup.speedup_model()"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
-   "source": [
-    "# the `conv1` has been replace from `Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))` to `Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))`\n",
-    "# and the following layer `conv2` has also changed because the input channel of `conv2` should aware the output channel of `conv1`.\n",
-    "\n",
-    "print(model)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "NaiveModel(\n",
      "  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))\n",
@@ -1067,24 +1062,21 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# `conv1` has been replaced: `Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))` became `Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))`.\n",
+    "# the following layer `conv2` has also changed, because the input channels of `conv2` must match the output channels of `conv1`.\n",
+    "\n",
+    "print(model)"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
-   "source": [
-    "# finetune the model to recover the accuracy.\n",
-    "\n",
-    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
-    "\n",
-    "for epoch in range(0, 1):\n",
-    "    trainer(model, optimizer, criterion, epoch)\n",
-    "    evaluator(model)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "Train Epoch: 0 [0/60000 (0%)]\tLoss: 0.306930\n",
      "Train Epoch: 0 [6400/60000 (11%)]\tLoss: 0.045807\n",
@@ -1102,47 +1094,49 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# finetune the model to recover the accuracy.\n",
+    "\n",
+    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
+    "\n",
+    "for epoch in range(0, 1):\n",
+    "    trainer(model, optimizer, criterion, epoch)\n",
+    "    evaluator(model)"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 5. Prepare config_list for quantization"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "config_list = [{\n",
-    "    'quant_types': ['weight'],\n",
-    "    'quant_bits': {'weight': 8},\n",
+    "    'quant_types': ['weight', 'input'],\n",
+    "    'quant_bits': {'weight': 8, 'input': 8},\n",
    "    'op_names': ['conv1', 'conv2']\n",
    "}]"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 6. Choose a quantizer and quantizing"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
-   "source": [
-    "from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\n",
-    "\n",
-    "quantizer = QAT_Quantizer(model, config_list, optimizer)\n",
-    "quantizer.compress()"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "NaiveModel(\n",
@@ -1162,26 +1156,26 @@
       ")"
      ]
     },
+     "execution_count": 20,
     "metadata": {},
-     "execution_count": 20
+     "output_type": "execute_result"
    }
   ],
-   "metadata": {}
+   "source": [
+    "from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer\n",
+    "\n",
+    "quantizer = QAT_Quantizer(model, config_list, optimizer)\n",
+    "quantizer.compress()"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
-   "source": [
-    "# finetune the model for calibration.\n",
-    "\n",
-    "for epoch in range(0, 1):\n",
-    "    trainer(model, optimizer, criterion, epoch)\n",
-    "    evaluator(model)"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "Train Epoch: 0 [0/60000 (0%)]\tLoss: 0.004960\n",
      "Train Epoch: 0 [6400/60000 (11%)]\tLoss: 0.036269\n",
@@ -1199,28 +1193,28 @@
     ]
    }
   ],
-   "metadata": {}
+   "source": [
+    "# finetune the model for calibration.\n",
+    "\n",
+    "for epoch in range(0, 1):\n",
+    "    trainer(model, optimizer, criterion, epoch)\n",
+    "    evaluator(model)"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
-   "source": [
-    "# export the sparsified model state to './quantized_naive_mnist_l1filter.pth'.\n",
-    "# export the calibration config to './calibration_naive_mnist_l1filter.pth'.\n",
-    "\n",
-    "quantizer.export_model(model_path='quantized_naive_mnist_l1filter.pth', calibration_path='calibration_naive_mnist_l1filter.pth')"
-   ],
+   "metadata": {},
   "outputs": [
    {
-     "output_type": "stream",
     "name": "stdout",
+     "output_type": "stream",
     "text": [
      "[2021-07-26 22:34:41] INFO (nni.compression.pytorch.compressor/MainThread) Model state_dict saved to quantized_naive_mnist_l1filter.pth\n",
      "[2021-07-26 22:34:41] INFO (nni.compression.pytorch.compressor/MainThread) Mask dict saved to calibration_naive_mnist_l1filter.pth\n"
     ]
    },
    {
-     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'conv1': {'weight_bit': 8,\n",
@@ -1231,30 +1225,36 @@
       "  'tracked_max_input': 4.246923446655273}}"
      ]
     },
+     "execution_count": 22,
     "metadata": {},
-     "execution_count": 22
+     "output_type": "execute_result"
    }
   ],
-   "metadata": {}
+   "source": [
+    "# export the quantized model state to './quantized_naive_mnist_l1filter.pth'.\n",
+    "# export the calibration config to './calibration_naive_mnist_l1filter.pth'.\n",
+    "\n",
+    "quantizer.export_model(model_path='quantized_naive_mnist_l1filter.pth', calibration_path='calibration_naive_mnist_l1filter.pth')"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# 7. Speed Up"
-   ],
-   "metadata": {}
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "# speed up with tensorRT\n",
    "\n",
    "engine = ModelSpeedupTensorRT(model, (32, 1, 28, 28), config=calibration_config, batchsize=32)\n",
    "engine.compress()"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
  }
 ],
 "metadata": {

--- a/docs/zh_CN/Compression/Overview.rst
+++ b/docs/zh_CN/Compression/Overview.rst
 .. 37577199d91c137b881450f825f38fa2

 使用 NNI 进行模型压缩
-==========================
+====================

 .. contents::

-随着更多层和节点大型神经网络的使用，降低其存储和计算成本变得至关重要，尤其是对于某些实时应用程序。 模型压缩可用于解决此问题。
+目前的大型神经网络较之以往具有更多的层和节点，而如何降低其存储和计算成本是一个重要的话题，尤其是针对于那些需要实时响应的应用程序。
+模型压缩的相关方法可以用于解决这些问题。

-NNI 的模型压缩工具包，提供了最先进的模型压缩算法和策略，帮助压缩并加速模型。 NNI 模型压缩支持的主要功能有：
+NNI 的模型压缩工具包，提供了最先进的模型压缩算法和策略，帮助压缩并加速模型。NNI 模型压缩支持的主要功能有：


 * 支持多种流行的剪枝和量化算法。
 * 通过 NNI 强大的自动调优功能，可使用最先进的策略来自动化模型的剪枝和量化过程。
 * 加速压缩的模型，使其在推理时有更低的延迟，同时文件也会变小。
-* 提供优化且易用的压缩工具，帮助用户深入了解压缩过程和结果。
+* 提供友好易用的压缩工具，帮助用户深入了解压缩过程和结果。
 * 提供简洁的接口，帮助用户实现自己的压缩算法。


 压缩流水线
--------------------
+----------

 .. image:: ../../img/compression_flow.jpg
   :target: ../../img/compression_flow.jpg
   :alt: 

-NNI的模型压缩流水线。 对于压缩预训练的模型，剪枝和量化可以单独使用或结合使用。 
+NNI整体的模型压缩流水线图。对于压缩一个预训练的模型，剪枝和量化可以单独使用或结合使用。 

 .. note::
-  NNI 压缩算法并不意味着压缩模型，NNI 的加速工具才可以真正压缩模型并减少延迟。 要获得真正压缩后的模型，用户应该进行 `模型加速 <./ModelSpeedup.rst>`__。 * 注意，PyTorch 和 TensorFlow 有统一的 API 接口，当前仅支持 PyTorch 版本，未来会提供 TensorFlow 的支持。
+  NNI 的压缩算法本身并不会真正使模型变小或者减少延迟，NNI 的加速工具才可以真正压缩模型并减少延迟。要获得真正压缩后的模型，用户应该进行 `模型加速 <./ModelSpeedup.rst>`__。注意，PyTorch 和 TensorFlow 有统一的 API 接口，当前仅支持 PyTorch 版本，未来会提供 TensorFlow 的支持。

 支持的算法
--------------------
+----------

 包括剪枝和量化算法。

 剪枝算法
-^^^^^^^^^^^^^^^^^^
+^^^^^^^^

 剪枝算法通过删除冗余权重或层通道来压缩原始网络，从而降低模型复杂性并解决过拟合问题。

@@ -48,41 +49,43 @@ NNI的模型压缩流水线。 对于压缩预训练的模型，剪枝和量化
   * - `Level Pruner <Pruner.rst#level-pruner>`__
     - 根据权重的绝对值，来按比例修剪权重。
   * - `AGP Pruner <../Compression/Pruner.rst#agp-pruner>`__
-     - 自动的逐步剪枝（是否剪枝的判断：基于对模型剪枝的效果）`参考论文 <https://arxiv.org/abs/1710.01878>`__
+     - 自动的逐步剪枝（To prune, or not to prune: exploring the efficacy of pruning for model compression）`参考论文 <https://arxiv.org/abs/1710.01878>`__
   * - `Lottery Ticket Pruner <../Compression/Pruner.rst#lottery-ticket>`__
     - "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks" 提出的剪枝过程。 它会反复修剪模型。 `参考论文 <https://arxiv.org/abs/1803.03635>`__
   * - `FPGM Pruner <../Compression/Pruner.rst#fpgm-pruner>`__
     - Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration `参考论文 <https://arxiv.org/pdf/1811.00250.pdf>`__
   * - `L1Filter Pruner <../Compression/Pruner.rst#l1filter-pruner>`__
-     - 在卷积层中具有最小 L1 权重规范的剪枝滤波器（用于 Efficient Convnets 的剪枝滤波器） `参考论文 <https://arxiv.org/abs/1608.08710>`__
+     - 在卷积层中具有最小 L1 权重规范的剪枝滤波器。（Pruning Filters for Efficient Convnets） `参考论文 <https://arxiv.org/abs/1608.08710>`__
   * - `L2Filter Pruner <../Compression/Pruner.rst#l2filter-pruner>`__
-     - 在卷积层中具有最小 L2 权重规范的剪枝滤波器
+     - 在卷积层中具有最小 L2 权重规范的剪枝滤波器。
   * - `ActivationAPoZRankFilterPruner <../Compression/Pruner.rst#activationapozrankfilter-pruner>`__
-     - 基于指标 APoZ（平均百分比零）的剪枝滤波器，该指标测量（卷积）图层激活中零的百分比。 `参考论文 <https://arxiv.org/abs/1607.03250>`__
+     - 基于指标 APoZ（平均百分比零）的剪枝滤波器，该指标测量（卷积）图层激活值中零的百分比。 `参考论文 <https://arxiv.org/abs/1607.03250>`__
   * - `ActivationMeanRankFilterPruner <../Compression/Pruner.rst#activationmeanrankfilter-pruner>`__
-     - 基于计算输出激活最小平均值指标的剪枝滤波器
+     - 基于计算输出激活最小平均值指标的剪枝滤波器。
   * - `Slim Pruner <../Compression/Pruner.rst#slim-pruner>`__
-     - 通过修剪 BN 层中的缩放因子来修剪卷积层中的通道 (Learning Efficient Convolutional Networks through Network Slimming) `参考论文 <https://arxiv.org/abs/1708.06519>`__
+     - 通过修剪 BN 层中的缩放因子来修剪卷积层中的通道。 (Learning Efficient Convolutional Networks through Network Slimming) `参考论文 <https://arxiv.org/abs/1708.06519>`__
   * - `TaylorFO Pruner <../Compression/Pruner.rst#taylorfoweightfilter-pruner>`__
-     - 基于一阶泰勒展开的权重对滤波器剪枝 (Importance Estimation for Neural Network Pruning) `参考论文 <http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf>`__
+     - 基于一阶泰勒展开的权重对滤波器剪枝。 (Importance Estimation for Neural Network Pruning) `参考论文 <http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf>`__
   * - `ADMM Pruner <../Compression/Pruner.rst#admm-pruner>`__
-     - 基于 ADMM 优化技术的剪枝 `参考论文 <https://arxiv.org/abs/1804.03294>`__
+     - 基于 ADMM 优化技术的剪枝。 `参考论文 <https://arxiv.org/abs/1804.03294>`__
   * - `NetAdapt Pruner <../Compression/Pruner.rst#netadapt-pruner>`__
-     - 在满足计算资源预算的情况下，对预训练的网络迭代剪枝 `参考论文 <https://arxiv.org/abs/1804.03230>`__
+     - 在满足计算资源预算的情况下，对预训练的网络迭代剪枝。 `参考论文 <https://arxiv.org/abs/1804.03230>`__
   * - `SimulatedAnnealing Pruner <../Compression/Pruner.rst#simulatedannealing-pruner>`__
-     - 通过启发式的模拟退火算法进行自动剪枝 `参考论文 <https://arxiv.org/abs/1907.03141>`__
+     - 通过启发式的模拟退火算法进行自动剪枝。 `参考论文 <https://arxiv.org/abs/1907.03141>`__
   * - `AutoCompress Pruner <../Compression/Pruner.rst#autocompress-pruner>`__
-     - 通过迭代调用 SimulatedAnnealing Pruner 和 ADMM Pruner 进行自动剪枝 `参考论文 - <https://arxiv.org/abs/1907.03141>`__
+     - 通过迭代调用 SimulatedAnnealing Pruner 和 ADMM Pruner 进行自动剪枝。 `参考论文 <https://arxiv.org/abs/1907.03141>`__
   * - `AMC Pruner <../Compression/Pruner.rst#amc-pruner>`__
-     - AMC：移动设备的模型压缩和加速 `参考论文 <https://arxiv.org/pdf/1802.03494.pdf>`__
+     - AMC: AutoML for Model Compression and Acceleration on Mobile Devices `参考论文 <https://arxiv.org/pdf/1802.03494.pdf>`__
+   * - `Transformer Head Pruner <../Compression/Pruner.rst#transformer-head-pruner>`__
+     - 针对 Transformer 中注意力头的剪枝。


 参考此 :githublink:`基准测试 <../CommunitySharings/ModelCompressionComparison.rst>` 来查看这些剪枝器在一些基准问题上的表现。

 量化算法
-^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^

-量化算法通过减少表示权重或激活所需的精度位数来压缩原始网络，这可以减少计算和推理时间。
+量化算法通过减少表示权重或激活函数所需的精度位数来压缩原始网络，这可以减少计算和推理时间。

 .. list-table::
   :header-rows: 1
@@ -91,36 +94,38 @@ NNI的模型压缩流水线。 对于压缩预训练的模型，剪枝和量化
   * - 名称
     - 算法简介
   * - `Naive Quantizer <../Compression/Quantizer.rst#naive-quantizer>`__
-     - 默认将权重量化为 8 位
+     - 默认将权重量化为 8 位。
   * - `QAT Quantizer <../Compression/Quantizer.rst#qat-quantizer>`__
-     - 为 Efficient Integer-Arithmetic-Only Inference 量化并训练神经网络。 `参考论文 <http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf>`__
+     - Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. `参考论文 <http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf>`__
   * - `DoReFa Quantizer <../Compression/Quantizer.rst#dorefa-quantizer>`__
-     - DoReFa-Net: 通过低位宽的梯度算法来训练低位宽的卷积神经网络。 `参考论文 <https://arxiv.org/abs/1606.06160>`__
+     - DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. `参考论文 <https://arxiv.org/abs/1606.06160>`__
   * - `BNN Quantizer <../Compression/Quantizer.rst#bnn-quantizer>`__
-     - 二进制神经网络：使用权重和激活限制为 +1 或 -1 的深度神经网络。 `参考论文 <https://arxiv.org/abs/1602.02830>`__
+     - Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1. `参考论文 <https://arxiv.org/abs/1602.02830>`__
   * - `LSQ Quantizer <../Compression/Quantizer.rst#lsq-quantizer>`__
-     - 可学习的步长量化。 `参考论文 <https://arxiv.org/pdf/1902.08153.pdf>`__
+     - Learned step size quantization. `参考论文 <https://arxiv.org/pdf/1902.08153.pdf>`__
+   * - `Observer Quantizer <../Compression/Quantizer.rst#observer-quantizer>`__
+     - Post training quantization. 使用 observer 在校准期间收集量化信息。


 模型加速
-------------
+--------

-模型压缩的目的是减少推理延迟和模型大小。 但现有的模型压缩算法主要通过模拟的方法来检查压缩模型性能（如精度）。例如，剪枝算法中使用掩码，而量化算法中量化值仍然是以 32 位浮点数来存储。 只要给出这些算法产生的掩码和量化位，NNI 可真正的加速模型。 基于掩码的模型加速详细教程可以在 `这里 <./ModelSpeedup.rst>`__ 找到。混合精度量化的详细教程可以在 `这里 <./QuantizationSpeedup.rst>`__ 找到。
+模型压缩的目的是减少推理延迟和模型大小。但现有的模型压缩算法主要通过模拟的方法来检查压缩模型性能（如精度）。例如，剪枝算法中使用掩码，而量化算法中量化值仍然是以 32 位浮点数来存储。只要给出这些算法产生的掩码和量化位，NNI 可真正的加速模型。基于掩码的模型加速详细教程可以在 `这里 <./ModelSpeedup.rst>`__ 找到。混合精度量化的详细教程可以在 `这里 <./QuantizationSpeedup.rst>`__ 找到。


 压缩工具
---------------------
+--------

-压缩工具包括了一些有用的工具，能帮助用户理解并分析要压缩的模型。 例如，可检查每层对剪枝的敏感度。 可很容易的计算模型的 FLOPs 和参数数量。 `点击这里 <./CompressionUtils.rst>`__，查看压缩工具的完整列表。
+压缩工具包括了一些有用的工具，能帮助用户理解并分析要压缩的模型。例如，可检查每层对剪枝的敏感度。可很容易的计算模型的 FLOPs 和参数数量。`点击这里 <./CompressionUtils.rst>`__，查看压缩工具的完整列表。

 高级用法
--------------
+--------

-NNI 模型压缩提供了简洁的接口，用于自定义新的压缩算法。 接口的设计理念是，将框架相关的实现细节包装起来，让用户能聚焦于压缩逻辑。 用户可以进一步了解我们的压缩框架，并根据我们的框架定制新的压缩算法（剪枝算法或量化算法）。 此外，还可利用 NNI 的自动调参功能来自动的压缩模型。 参考 `这里 <./advanced.rst>`__ 了解更多细节。
+NNI 模型压缩提供了简洁的接口，用于自定义新的压缩算法。接口的设计理念是，将框架相关的实现细节包装起来，让用户能聚焦于压缩逻辑。用户可以进一步了解我们的压缩框架，并根据我们的框架定制新的压缩算法（剪枝算法或量化算法）。此外，还可利用 NNI 的自动调参功能来自动的压缩模型。参考 `这里 <./advanced.rst>`__ 了解更多细节。


 参考和反馈
----------------------
+----------

 * 在Github 中 `提交此功能的 Bug <https://github.com/microsoft/nni/issues/new?template=bug-report.rst>`__
 * 在Github 中 `提交新功能或请求改进 <https://github.com/microsoft/nni/issues/new?template=enhancement.rst>`__

--- a/docs/zh_CN/Compression/QuickStart.rst
+++ b/docs/zh_CN/Compression/QuickStart.rst
-.. b7c7ceacdaabfcf383be1e74d067098d
+.. a67033195635ebcd510103eab8703b6a

 快速入门
 ===========
@@ -9,7 +9,10 @@
    Notebook Example <compression_pipeline_example>


-模型压缩通常包括三个阶段：1）预训练模型，2）压缩模型，3）微调模型。 NNI 主要关注于第二阶段，并为模型压缩提供非常简单的 API。 遵循本指南，快速了解如何使用 NNI 压缩模型。 NNI 主要关注于第二阶段，并为模型压缩提供非常简单的 API。 恭喜！ 您已经通过 NNI 压缩了您的第一个模型。 更深入地了解 NNI 中的模型压缩，请查看 `Tutorial <./Tutorial.rst>`__。 
+模型压缩通常包括三个阶段：1）预训练模型，2）压缩模型，3）微调模型。 NNI 主要关注于第二阶段，并为模型压缩提供易于使用的 API。
+遵循本指南，您将快速了解如何使用 NNI 来压缩模型。
+更深入地了解 NNI 中的模型压缩模块，请查看 `Tutorial <./Tutorial.rst>`__。
+此外，我们还提供了一个在 Jupyter notebook 中完成完整模型压缩流程的 `示例 <./compression_pipeline_example.rst>`__，参考 :githublink:`代码 <examples/notebooks/compression_pipeline_example.ipynb>`。

 模型剪枝
 -------------
@@ -19,7 +22,7 @@
 Step1. 编写配置
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

-编写配置来指定要剪枝的层。以下配置表示剪枝所有的 ``default`` 操作，稀疏度设为 0.5，其它层保持不变。
+编写配置来指定要剪枝的层。以下配置表示剪枝所有的 ``default`` 层，稀疏度设为 0.5，其它层保持不变。

 .. code-block:: python

@@ -28,34 +31,36 @@ Step1. 编写配置
       'op_types': ['default'],
   }]

-配置说明在 `这里 <./Tutorial.rst#quantization-specific-keys>`__。 注意，不同的 Pruner 可能有自定义的配置字段，例如，AGP Pruner 有 ``start_epoch``。 详情参考每个 Pruner 的 `用法 <./Pruner.rst>`__，来调整相应的配置。
+配置说明在 `这里 <./Tutorial.rst#specify-the-configuration>`__。注意，不同的 Pruner 可能有自定义的配置字段。
+详情参考每个 Pruner 的 `具体用法 <./Pruner.rst>`__，来调整相应的配置。

 Step2. 选择 Pruner 来压缩模型
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-首先，使用模型来初始化 Pruner，并将配置作为参数传入，然后调用 ``compress()`` 来压缩模型。 请注意，有些算法可能会检查压缩的梯度，因此我们还定义了一个优化器并传递给 Pruner。
+首先，使用模型来初始化 Pruner，并将配置作为参数传入，然后调用 ``compress()`` 来压缩模型。
+请注意，有些算法可能会检查训练过程中的梯度，因此我们可能会定义一组 trainer, optimizer, criterion 并传递给 Pruner。

 .. code-block:: python

   from nni.algorithms.compression.pytorch.pruning import LevelPruner

-   pruner = LevelPruner(model, config_list, optimizer_finetune)
+   pruner = LevelPruner(model, config_list)
   model = pruner.compress()

-然后，使用正常的训练方法来训练模型 （如，SGD），剪枝在训练过程中是透明的。 有些 Pruner（如 L1FilterPruner、FPGMPruner）在开始时修剪一次，下面的训练可以看作是微调。 有些 Pruner（例如AGPPruner）会迭代的对模型剪枝，在训练过程中逐步修改掩码。
+然后，使用正常的训练方法来训练模型 （如，SGD），剪枝在训练过程中是透明的。
+有些 Pruner（如 L1FilterPruner、FPGMPruner）在开始时修剪一次，下面的训练可以看作是微调。
+有些 Pruner（例如AGPPruner）会迭代的对模型剪枝，在训练过程中逐步修改掩码。

-注意，``pruner.compress`` 只会在模型权重上直接增加掩码，不包括调优的逻辑。 如果要想调优压缩后的模型，需要在 ``pruner.compress`` 后增加调优的逻辑。
+如果使用 Pruner 进行迭代剪枝，或者剪枝过程中需要训练或者推理，则需要将 finetune 逻辑传到 Pruner 中。

 例如：

 .. code-block:: python

-   for epoch in range(1, args.epochs + 1):
-        pruner.update_epoch(epoch)
-        train(args, model, device, train_loader, optimizer_finetune, epoch)
-        test(model, device, test_loader)
+   from nni.algorithms.compression.pytorch.pruning import AGPPruner

-更多关于微调的 API 在 `这里 <./Tutorial.rst#api>`__。 
+   pruner = AGPPruner(model, config_list, optimizer, trainer, criterion, num_iterations=10, epochs_per_iteration=1, pruning_algorithm='level')
+   model = pruner.compress()


 Step3. 导出压缩结果
@@ -83,16 +88,21 @@ Step1. 编写配置
 .. code-block:: python

   config_list = [{
-       'quant_types': ['weight'],
+       'quant_types': ['weight', 'input'],
       'quant_bits': {
           'weight': 8,
+           'input': 8,
       }, # 这里可以仅使用 `int`，因为所有 `quant_types` 使用了一样的位长，参考下方 `ReLU6` 配置。
-       'op_types':['Conv2d', 'Linear']
+       'op_types':['Conv2d', 'Linear'],
+       'quant_dtype': 'int',
+       'quant_scheme': 'per_channel_symmetric'
   }, {
       'quant_types': ['output'],
       'quant_bits': 8,
       'quant_start_step': 7000,
-       'op_types':['ReLU6']
+       'op_types':['ReLU6'],
+       'quant_dtype': 'uint',
+       'quant_scheme': 'per_tensor_affine'
   }]

 配置说明在 `这里 <./Tutorial.rst#quantization-specific-keys>`__。

--- a/docs/zh_CN/Compression/compression_pipeline_example.ipynb
+++ b/docs/zh_CN/Compression/compression_pipeline_example.ipynb
+./en_US/Compression/compression_pipeline_example.ipynb
\ No newline at end of file