Unverified commit a911b856, authored by Yuge Zhang and committed by GitHub

Resolve conflicts for #4760 (#4762)

parent 14d2966b
"""
Start and Manage a New Experiment
=================================
"""
# %%
# Configure Search Space
# ----------------------
search_space = {
"C": {"_type": "quniform", "_value": [0.1, 1, 0.1]},
"kernel": {"_type": "choice", "_value": ["linear", "rbf", "poly", "sigmoid"]},
"degree": {"_type": "choice", "_value": [1, 2, 3, 4]},
"gamma": {"_type": "quniform", "_value": [0.01, 0.1, 0.01]},
"coef0": {"_type": "quniform", "_value": [0.01, 0.1, 0.01]}
}
# %%
# Configure Experiment
# --------------------
from nni.experiment import Experiment
experiment = Experiment('local')
experiment.config.experiment_name = 'Example'
experiment.config.trial_concurrency = 2
experiment.config.max_trial_number = 10
experiment.config.search_space = search_space
experiment.config.trial_command = 'python scripts/trial_sklearn.py'
experiment.config.trial_code_directory = './'
experiment.config.tuner.name = 'TPE'
experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
experiment.config.training_service.use_active_gpu = True
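# %%
# The trial command above points to ``scripts/trial_sklearn.py``, which is not shown in this tutorial.
# As a rough sketch (not the actual script shipped with NNI), a trial typically receives the sampled
# hyperparameters through ``nni.get_next_parameter()`` and reports its result with ``nni.report_final_result()``:
#
# .. code-block:: python
#
#     import nni
#     from sklearn.datasets import load_iris
#     from sklearn.model_selection import cross_val_score
#     from sklearn.svm import SVC
#
#     params = nni.get_next_parameter()  # e.g. {'C': 0.5, 'kernel': 'rbf', ...}
#     X, y = load_iris(return_X_y=True)
#     score = cross_val_score(SVC(**params), X, y, cv=5).mean()
#     nni.report_final_result(score)     # the tuner maximizes this value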
# %%
# Start Experiment
# ----------------
experiment.start(8080)
# %%
# Experiment View & Control
# -------------------------
#
# View the status of the experiment.
experiment.get_status()
# %%
# Wait until at least one trial finishes.
import time
for _ in range(10):
    stats = experiment.get_job_statistics()
    if any(stat['trialJobStatus'] == 'SUCCEEDED' for stat in stats):
        break
    time.sleep(10)
# %%
# Export the experiment data.
experiment.export_data()
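# %%
# ``export_data`` returns a list of trial results; each record carries the sampled parameters and the
# reported metric (the attribute names below are an assumption based on ``nni.experiment``):
#
# .. code-block:: python
#
#     for trial in experiment.export_data():
#         print(trial.parameter, trial.value)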
# %%
# Get the metrics of jobs.
experiment.get_job_metrics()
# %%
# Stop Experiment
# ---------------
experiment.stop()
"""
Customize Basic Pruner
======================
Users can easily customize a basic pruner in NNI. NNI provides a large number of basic modules that can be reused;
by following the NNI pruning interface, users only need to focus on the creative parts of their algorithm
without worrying about the other regular modules. In this tutorial, we show how to customize a basic pruner.

Concepts
--------
NNI abstracts the basic pruning process into three steps: collecting data, calculating metrics, and allocating sparsity.
Most pruning algorithms rely on a metric to decide what should be pruned. Taking the L1 norm pruner as an example,
the first step is collecting the model weights, the second step is calculating the L1 norm of the weight for each output channel,
and the third step is ranking the L1 norm metrics and masking the output channels with small L1 norms.
In the NNI basic pruner, these three steps are implemented as ``DataCollector``, ``MetricsCalculator`` and ``SparsityAllocator``.

- ``DataCollector``: This module takes the pruner as an initialization parameter.
  It gets the relevant information about the model from the pruner,
  and sometimes it also hooks the model to get the input, output or gradient of a layer or a tensor.
  It can also patch the optimizer if some special steps need to be executed before or after ``optimizer.step()``.
- ``MetricsCalculator``: This module takes the data collected by the ``DataCollector``
  and calculates the metrics. The metric shape is usually reduced from the data shape.
  The ``dim`` argument of ``MetricsCalculator`` indicates which dimensions are kept after the metrics are calculated.
  For example, if the collected data shape is (10, 20, 30) and ``dim`` is 1, then dimension 1 is kept
  and the output metric shape is (20,).
- ``SparsityAllocator``: This module takes the metrics and generates the masks.
  Different ``SparsityAllocator`` implementations have different mask generation strategies.
  A common and simple strategy is sorting the metric values, calculating a threshold according to the configured sparsity,
  and masking the positions whose metric values are smaller than the threshold.
  The ``dim`` argument of ``SparsityAllocator`` indicates which dimension of the weight the metrics correspond to;
  the mask is then expanded to the weight shape. For example, if the metric shape is (20,),
  the corresponding layer weight shape is (20, 40), and ``dim`` is 0,
  ``SparsityAllocator`` first generates a mask of shape (20,) and then expands it to shape (20, 40).
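
Before diving into the example below, here is a minimal plain-PyTorch sketch (illustrative only, not NNI code)
of how a block-wise L1 metric can be computed with ``unfold``, which is exactly what the custom
``MetricsCalculator`` in the next section does:

.. code-block:: python

    import torch

    weight = torch.randn(4, 6)              # pretend this is a Linear weight
    blocks = weight.unfold(0, 2, 2)         # shape (2, 6, 2)
    blocks = blocks.unfold(1, 2, 2)         # shape (2, 3, 2, 2): a grid of 2x2 blocks
    metric = blocks.norm(p=1, dim=[2, 3])   # L1 norm per block, shape (2, 3)
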
Simple Example: Customize a Block-L1NormPruner
----------------------------------------------
NNI already provides an ``L1NormPruner``, but in order to reproduce the paper and reduce the number of user configuration items,
it only supports pruning output channels. In this example, we customize a pruner that supports block granularity for ``Linear`` layers.
Note that you don't need to implement all three kinds of tools every time;
NNI provides many predefined tools that you can use directly to customize your own pruner.
Because this is a tutorial, we show how to define all three kinds of pruning tools.
We start by customizing the pruning tools used by the pruner.
"""
import torch
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import BasicPruner
from nni.algorithms.compression.v2.pytorch.pruning.tools import (
DataCollector,
MetricsCalculator,
SparsityAllocator
)
# This data collector collects the weights of the wrapped modules as data.
# The wrapped modules are the modules configured in the pruner's config_list.
# This implementation is similar to nni.algorithms.compression.v2.pytorch.pruning.tools.WeightDataCollector.
class WeightDataCollector(DataCollector):
    def collect(self):
        data = {}
        # get_modules_wrapper returns all the wrappers in the compressor (pruner)
        # as a dict with format {wrapper_name: wrapper};
        # use wrapper.module to get the wrapped module.
        for _, wrapper in self.compressor.get_modules_wrapper().items():
            data[wrapper.name] = wrapper.module.weight.data
        # return {wrapper_name: weight_data}
        return data
class BlockNormMetricsCalculator(MetricsCalculator):
    def __init__(self, block_sparse_size):
        # Because we keep all dimensions with block granularity, we fix ``dim=None``,
        # which means all dimensions are kept.
        super().__init__(dim=None, block_sparse_size=block_sparse_size)

    def calculate_metrics(self, data):
        data_length = len(self.block_sparse_size)
        reduce_unfold_dims = list(range(data_length, 2 * data_length))
        metrics = {}
        for name, t in data.items():
            # Unfold t into blocks of block_sparse_size and calculate the L1 norm of each block.
            for dim, size in enumerate(self.block_sparse_size):
                t = t.unfold(dim, size, size)
            metrics[name] = t.norm(dim=reduce_unfold_dims, p=1)
        # return {wrapper_name: block_metric}
        return metrics
# This implementation is similar to nni.algorithms.compression.v2.pytorch.pruning.tools.NormalSparsityAllocator.
class BlockSparsityAllocator(SparsityAllocator):
    def __init__(self, pruner, block_sparse_size):
        super().__init__(pruner, dim=None, block_sparse_size=block_sparse_size, continuous_mask=True)

    def generate_sparsity(self, metrics):
        masks = {}
        for name, wrapper in self.pruner.get_modules_wrapper().items():
            # wrapper.config['total_sparsity'] is the configured sparsity ratio for this wrapped module
            sparsity_rate = wrapper.config['total_sparsity']
            # get the metric of this wrapped module
            metric = metrics[name]
            # mask the metric with the old mask so that already-masked positions never recover;
            # just keep this as-is if you are new to NNI pruning
            if self.continuous_mask:
                metric *= self._compress_mask(wrapper.weight_mask)
            # convert the sparsity ratio to the number of blocks to prune
            prune_num = int(sparsity_rate * metric.numel())
            # calculate the metric threshold
            threshold = torch.topk(metric.view(-1), prune_num, largest=False)[0].max()
            # generate the mask, keeping the positions whose metric values are greater than the threshold
            mask = torch.gt(metric, threshold).type_as(metric)
            # expand the mask to the weight shape: a masked block is filled with zeros,
            # an unmasked block is filled with ones
            masks[name] = self._expand_mask(name, mask)
            # merge the new mask with the old mask so that already-masked positions never recover;
            # just keep this as-is if you are new to NNI pruning
            if self.continuous_mask:
                masks[name]['weight'] *= wrapper.weight_mask
        return masks
# %%
# Customize the pruner.
class BlockL1NormPruner(BasicPruner):
    def __init__(self, model, config_list, block_sparse_size):
        self.block_sparse_size = block_sparse_size
        super().__init__(model, config_list)

    # Implementing reset_tools is enough for this pruner.
    def reset_tools(self):
        if self.data_collector is None:
            self.data_collector = WeightDataCollector(self)
        else:
            self.data_collector.reset()
        if self.metrics_calculator is None:
            self.metrics_calculator = BlockNormMetricsCalculator(self.block_sparse_size)
        if self.sparsity_allocator is None:
            self.sparsity_allocator = BlockSparsityAllocator(self, self.block_sparse_size)
# %%
# Try this pruner.
# Define a simple model.
class TestModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc1 = torch.nn.Linear(4, 8)
        self.fc2 = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.fc2(self.fc1(x))
model = TestModel()
config_list = [{'op_types': ['Linear'], 'total_sparsity': 0.5}]
# use 2x2 block
_, masks = BlockL1NormPruner(model, config_list, [2, 2]).compress()
# show the generated masks
print('fc1 masks:\n', masks['fc1']['weight'])
print('fc2 masks:\n', masks['fc2']['weight'])
# %%
# We have now successfully defined a new pruner with block pruning granularity!
# Note that we did not include validation logic in this example (e.g. ``_validate_config_before_canonical``),
# but for a robust implementation we suggest adding it.
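# %%
# As a rough illustration only (check the ``BasicPruner`` source for the exact hook name and signature,
# which are assumed here), such validation could look like:
#
# .. code-block:: python
#
#     def _validate_config_before_canonical(self, model, config_list):
#         # hypothetical sketch: require a sparsity setting in every config entry
#         for config in config_list:
#             assert 'total_sparsity' in config, 'each config entry should set total_sparsity'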
"""
Pruning Quickstart
==================
Model pruning is a technique to reduce the model size and computation by reducing model weight size or intermediate state size.
There are three common practices for pruning a DNN model:
#. Pre-training a model -> Pruning the model -> Fine-tuning the pruned model
#. Pruning a model during training (i.e., pruning-aware training) -> Fine-tuning the pruned model
#. Pruning a model -> Training the pruned model from scratch
NNI supports all of the above pruning practices by working on the key pruning stage.
Follow this tutorial for a quick look at how to use NNI to prune a model in a common practice.
"""
# %%
# Preparation
# -----------
#
# In this tutorial, we use a simple model pre-trained on the MNIST dataset.
# If you are familiar with defining a model and training it in PyTorch, you can skip directly to `Pruning Model`_.
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device
# define the model
model = TorchModel().to(device)
# show the model structure; note that the pruner will wrap the model layers.
print(model)
# %%
# define the optimizer and criterion for pre-training
optimizer = SGD(model.parameters(), 1e-2)
criterion = F.nll_loss
# pre-train and evaluate the model on MNIST dataset
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# Pruning Model
# -------------
#
# Use L1NormPruner to prune the model and generate the masks.
# Usually, a pruner requires the original model and a ``config_list`` as its inputs.
# For details about how to write a ``config_list``, please refer to :doc:`compression config specification <../compression/compression_config_list>`.
#
# The following ``config_list`` means all layers whose type is ``Linear`` or ``Conv2d`` will be pruned to a final
# sparsity ratio of 50%, except the layer named ``fc3``, which is marked as ``exclude`` and will not be pruned.
config_list = [{
'sparsity_per_layer': 0.5,
'op_types': ['Linear', 'Conv2d']
}, {
'exclude': True,
'op_names': ['fc3']
}]
# %%
# Pruners usually require `model` and `config_list` as input arguments.
from nni.compression.pytorch.pruning import L1NormPruner
pruner = L1NormPruner(model, config_list)
# show the wrapped model structure; ``PrunerModuleWrapper`` has wrapped the layers configured in the config_list.
print(model)
# %%
# compress the model and generate the masks
_, masks = pruner.compress()
# show the masks sparsity
for name, mask in masks.items():
    print(name, ' sparsity : ', '{:.2}'.format(mask['weight'].sum() / mask['weight'].numel()))
# %%
# Speed up the original model with the masks; note that ``ModelSpeedup`` requires an unwrapped model.
# The model becomes smaller after speedup
# and reaches a higher sparsity ratio, because ``ModelSpeedup`` propagates the masks across layers.
# unwrap the model, if the model was wrapped before speedup
pruner._unwrap_model()
# speed up the model; for more information about speedup, please refer to :doc:`pruning_speedup`.
from nni.compression.pytorch.speedup import ModelSpeedup
ModelSpeedup(model, torch.rand(3, 1, 28, 28).to(device), masks).speedup_model()
# %%
# the model becomes physically smaller after speedup
print(model)
# %%
# Fine-tuning Compacted Model
# ---------------------------
# Note that if the model has been sped up, you need to re-initialize a new optimizer for fine-tuning,
# because speedup replaces the masked large layers with dense small ones.
optimizer = SGD(model.parameters(), 1e-2)
for epoch in range(3):
    trainer(model, optimizer, criterion)
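# %%
# Finally, you may want to evaluate the fine-tuned compact model again
# (``evaluator`` comes from the helper script imported above).
evaluator(model)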
"""
Speedup Model with Mask
========================
Introduction
------------
Pruning algorithms usually use weight masks to simulate real pruning. Masks can be used
to check the model performance under a specific pruning (or sparsity) setting, but they bring no real speedup.
Since model speedup is the ultimate goal of model pruning, we provide a tool that converts a model
into a smaller one based on user-provided masks (the masks come from the pruning algorithms).
There are two types of pruning. One is fine-grained pruning, which does not change the shape of the weights
or the input/output tensors; a sparse kernel is required to speed up a fine-grained pruned layer.
The other is coarse-grained pruning (e.g., channel pruning), where the shapes of weights and input/output tensors usually change.
To speed up this kind of pruning, there is no need for a sparse kernel; the pruned layer is simply replaced with a smaller one.
Since community support for sparse kernels is limited,
we currently only support the speedup of coarse-grained pruning and leave the support of fine-grained pruning for the future.
Design and Implementation
-------------------------
To speed up a model, the pruned layers should be replaced, either with a smaller layer for a coarse-grained mask
or with a sparse kernel for a fine-grained mask. Coarse-grained masks usually change the shape of the weights or input/output tensors,
so we perform shape inference to check whether other, unpruned layers also need to be replaced due to the shape change.
Therefore, our design has two main steps: first, do shape inference to find all the modules that should be replaced;
second, replace the modules.
The first step requires the topology (i.e., the connections) of the model; we use ``jit.trace`` to obtain the model graph for PyTorch.
The new shapes of the modules are automatically inferred by NNI: the unchanged parts of the outputs during the forward pass
and of the inputs during the backward pass indicate which dimensions can be reduced.
For each type of module, we need a module replacement function, which returns a newly created, smaller module.
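
As an illustration of what such a replacement function can look like (a simplified sketch, not NNI's internal API),
replacing a ``Linear`` layer whose output channels are partially masked could be done as follows:

.. code-block:: python

    import torch

    def replace_linear(old: torch.nn.Linear, kept_out_idx: torch.Tensor) -> torch.nn.Linear:
        # build a smaller Linear that keeps only the unmasked output features
        new = torch.nn.Linear(old.in_features, len(kept_out_idx), bias=old.bias is not None)
        new.weight.data = old.weight.data[kept_out_idx].clone()
        if old.bias is not None:
            new.bias.data = old.bias.data[kept_out_idx].clone()
        return new
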
Usage
-----
"""
# %%
# First, generate a mask for the model.
# We usually use an NNI pruner to generate the masks and then use ``ModelSpeedup`` to compact the model.
# However, ``ModelSpeedup`` is a relatively independent tool, so you can also use it on its own.
import torch
from scripts.compression_mnist_model import TorchModel, device
model = TorchModel().to(device)
# masks = {layer_name: {'weight': weight_mask, 'bias': bias_mask}}
conv1_mask = torch.ones_like(model.conv1.weight.data)
# mask the first three output channels in conv1
conv1_mask[0: 3] = 0
masks = {'conv1': {'weight': conv1_mask}}
# %%
# Show the original model structure.
print(model)
# %%
# Roughly test the original model inference speed.
import time
start = time.time()
model(torch.rand(128, 1, 28, 28).to(device))
print('Original Model - Elapsed Time : ', time.time() - start)
# %%
# Speedup the model and show the model structure after speedup.
from nni.compression.pytorch import ModelSpeedup
ModelSpeedup(model, torch.rand(10, 1, 28, 28).to(device), masks).speedup_model()
print(model)
# %%
# Roughly test the inference speed of the model after speedup.
start = time.time()
model(torch.rand(128, 1, 28, 28).to(device))
print('Speedup Model - Elapsed Time : ', time.time() - start)
# %%
# For the combined usage of ``Pruner`` mask generation and ``ModelSpeedup``,
# please refer to :doc:`Pruning Quick Start <pruning_quick_start_mnist>`.
#
# NOTE: The current implementation supports PyTorch 1.3.1 or newer.
#
# Limitations
# -----------
#
# For PyTorch, we can only replace modules; if functions in ``forward`` need to be replaced,
# our current implementation does not work. One workaround is to make the function a PyTorch module.
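#
# For example (an illustrative sketch), a ``torch.flatten`` call inside ``forward`` can be replaced by an
# ``nn.Flatten`` module registered in ``__init__`` so that the speedup tool can handle it:
#
# .. code-block:: python
#
#     import torch.nn as nn
#
#     class Net(nn.Module):
#         def __init__(self):
#             super().__init__()
#             self.conv = nn.Conv2d(3, 8, 3)
#             self.flatten = nn.Flatten()  # module instead of calling torch.flatten(x, 1) in forward
#
#         def forward(self, x):
#             return self.flatten(self.conv(x))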
#
# If you want to speed up your own model which is not supported by the current implementation,
# you need to implement the replacement function for module replacement; contributions are welcome.
#
# Speedup Results of Examples
# ---------------------------
#
# The code of these experiments can be found :githublink:`here <examples/model_compress/pruning/legacy/speedup/model_speedup.py>`.
#
# These results were tested on the `legacy pruning framework <https://nni.readthedocs.io/en/v2.6/Compression/pruning.html>`_; new results are coming soon.
#
# slim pruner example
# ^^^^^^^^^^^^^^^^^^^
#
# on one V100 GPU,
# input tensor: ``torch.randn(64, 3, 32, 32)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - Times
# - Mask Latency
# - Speedup Latency
# * - 1
# - 0.01197
# - 0.005107
# * - 2
# - 0.02019
# - 0.008769
# * - 4
# - 0.02733
# - 0.014809
# * - 8
# - 0.04310
# - 0.027441
# * - 16
# - 0.07731
# - 0.05008
# * - 32
# - 0.14464
# - 0.10027
#
# fpgm pruner example
# ^^^^^^^^^^^^^^^^^^^
#
# on CPU,
# input tensor: ``torch.randn(64, 1, 28, 28)``\ ,
# the variance is too large
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - Times
# - Mask Latency
# - Speedup Latency
# * - 1
# - 0.01383
# - 0.01839
# * - 2
# - 0.01167
# - 0.003558
# * - 4
# - 0.01636
# - 0.01088
# * - 40
# - 0.14412
# - 0.08268
# * - 40
# - 1.29385
# - 0.14408
# * - 40
# - 0.41035
# - 0.46162
# * - 400
# - 6.29020
# - 5.82143
#
# l1filter pruner example
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# on one V100 GPU,
# input tensor: ``torch.randn(64, 3, 32, 32)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - Times
# - Mask Latency
# - Speedup Latency
# * - 1
# - 0.01026
# - 0.003677
# * - 2
# - 0.01657
# - 0.008161
# * - 4
# - 0.02458
# - 0.020018
# * - 8
# - 0.03498
# - 0.025504
# * - 16
# - 0.06757
# - 0.047523
# * - 32
# - 0.10487
# - 0.086442
#
# APoZ pruner example
# ^^^^^^^^^^^^^^^^^^^
#
# on one V100 GPU,
# input tensor: ``torch.randn(64, 3, 32, 32)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - Times
# - Mask Latency
# - Speedup Latency
# * - 1
# - 0.01389
# - 0.004208
# * - 2
# - 0.01628
# - 0.008310
# * - 4
# - 0.02521
# - 0.014008
# * - 8
# - 0.03386
# - 0.023923
# * - 16
# - 0.06042
# - 0.046183
# * - 32
# - 0.12421
# - 0.087113
#
# SimulatedAnnealing pruner example
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# In this experiment, we use the SimulatedAnnealing pruner to prune ResNet-18 on the CIFAR-10 dataset.
# We measure the latencies and accuracies of the pruned model under different sparsity ratios, as shown in the following figure.
# The latency is measured on one V100 GPU and the input tensor is ``torch.randn(128, 3, 32, 32)``.
#
# .. image:: ../../img/SA_latency_accuracy.png
"""
Customize a new quantization algorithm
======================================
To write a new quantization algorithm, you can write a class that inherits ``nni.compression.pytorch.Quantizer``.
Then, override the member functions with the logic of your algorithm. The member function to override is ``quantize_weight``.
``quantize_weight`` directly returns the quantized weights rather than a mask, because for quantization the quantized weights cannot be obtained by applying a mask.
"""
from nni.compression.pytorch import Quantizer
class YourQuantizer(Quantizer):
    def __init__(self, model, config_list):
        """
        We suggest you use the NNI-defined spec for the config.
        """
        super().__init__(model, config_list)

    def quantize_weight(self, weight, config, **kwargs):
        """
        Quantizers should overload this method to quantize weight tensors.
        This method is effectively hooked to :meth:`forward` of the model.
        Parameters
        ----------
        weight : Tensor
            weight that needs to be quantized
        config : dict
            the configuration for weight quantization
        """
        # Put your code to generate `new_weight` here
        new_weight = ...
        return new_weight

    def quantize_output(self, output, config, **kwargs):
        """
        Quantizers should overload this method to quantize output.
        This method is effectively hooked to :meth:`forward` of the model.
        Parameters
        ----------
        output : Tensor
            output that needs to be quantized
        config : dict
            the configuration for output quantization
        """
        # Put your code to generate `new_output` here
        new_output = ...
        return new_output

    def quantize_input(self, *inputs, config, **kwargs):
        """
        Quantizers should overload this method to quantize input.
        This method is effectively hooked to :meth:`forward` of the model.
        Parameters
        ----------
        inputs : Tensor
            inputs that need to be quantized
        config : dict
            the configuration for input quantization
        """
        # Put your code to generate `new_input` here
        new_input = ...
        return new_input

    def update_epoch(self, epoch_num):
        pass

    def step(self):
        """
        Can do some processing based on the model or the weights bound
        in the function ``bind_model``.
        """
        pass
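# %%
# As an illustration only (not NNI's built-in implementation), ``quantize_weight`` could perform a simple
# 8-bit symmetric simulated quantization like this:
#
# .. code-block:: python
#
#     import torch
#
#     def quantize_weight(self, weight, config, **kwargs):
#         # scale so that the largest magnitude maps to 127, then round and dequantize
#         scale = weight.abs().max().clamp(min=1e-8) / 127
#         return torch.round(weight / scale) * scale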
# %%
# Customize backward function
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Sometimes it's necessary for a quantization operation to have a customized backward function,
# such as the `Straight-Through Estimator <https://stackoverflow.com/questions/38361314/the-concept-of-straight-through-estimator-ste>`__\ ,
# users can customize a backward function as follows:
from nni.compression.pytorch.compressor import Quantizer, QuantGrad, QuantType
class ClipGrad(QuantGrad):
    @staticmethod
    def quant_backward(tensor, grad_output, quant_type):
        """
        This method should be overridden by subclasses to provide a customized backward function;
        the default implementation is the Straight-Through Estimator.
        Parameters
        ----------
        tensor : Tensor
            input of the quantization operation
        grad_output : Tensor
            gradient of the output of the quantization operation
        quant_type : QuantType
            the type of quantization, it can be `QuantType.INPUT`, `QuantType.WEIGHT`, `QuantType.OUTPUT`;
            you can define different behavior for different types.
        Returns
        -------
        tensor
            gradient of the input of the quantization operation
        """
        # for output quantization, set the gradient to zero where the absolute value of the tensor is larger than 1
        if quant_type == QuantType.OUTPUT:
            grad_output[tensor.abs() > 1] = 0
        return grad_output

class _YourQuantizer(Quantizer):
    def __init__(self, model, config_list):
        super().__init__(model, config_list)
        # set your customized backward function to overwrite the default backward function
        self.quant_grad = ClipGrad
# %%
# If you do not customize ``QuantGrad``, the default backward is Straight-Through Estimator.
"""
Quantization Quickstart
=======================
Quantization reduces model size and speeds up inference time by reducing the number of bits required to represent weights or activations.
In NNI, both post-training quantization algorithms and quantization-aware training algorithms are supported.
Here we use `QAT_Quantizer` as an example to show the usage of quantization in NNI.
"""
# %%
# Preparation
# -----------
#
# In this tutorial, we use a simple model and pre-train it on the MNIST dataset.
# If you are familiar with defining a model and training it in PyTorch, you can skip directly to `Quantizing Model`_.
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, trainer, evaluator, device, test_trt
# define the model
model = TorchModel().to(device)
# define the optimizer and criterion for pre-training
optimizer = SGD(model.parameters(), 1e-2)
criterion = F.nll_loss
# pre-train and evaluate the model on MNIST dataset
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# Quantizing Model
# ----------------
#
# Initialize a ``config_list``.
# For details about how to write a ``config_list``, please refer to :doc:`compression config specification <../compression/compression_config_list>`.
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
# %%
# fine-tune the model using QAT
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
dummy_input = torch.rand(32, 1, 28, 28).to(device)
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
# %%
# The model has now been wrapped, and the quantization targets (the 'quant_types' settings in `config_list`)
# will be quantized & dequantized for simulated quantization in the wrapped layers.
# QAT is a quantization-aware training algorithm; it updates the scale and zero point during training.
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# export model and get calibration_config
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# build the TensorRT engine to get a real speedup; for more information about speedup, please refer to :doc:`quantization_speedup`.
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
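# %%
# ``test_trt`` (defined in the helper script) loops over the test set and calls ``engine.inference``;
# a single batch can be evaluated the same way, for example:
#
# .. code-block:: python
#
#     output, latency = engine.inference(torch.rand(32, 1, 28, 28))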
"""
SpeedUp Model with Calibration Config
======================================
Introduction
------------
Deep learning networks are computation intensive and memory intensive,
which increases the difficulty of deploying deep neural network models. Quantization is a
fundamental technique that is widely used to reduce the memory footprint and speed up the inference
process. Many frameworks have begun to support quantization, but few of them support mixed-precision
quantization and achieve real speedup. Frameworks like `HAQ: Hardware-Aware Automated Quantization with Mixed Precision <https://arxiv.org/pdf/1811.08886.pdf>`__\ only support simulated mixed-precision quantization, which does
not speed up the inference process. To get real speedup from mixed-precision quantization and
help people get real feedback from hardware, we designed a general framework with a simple interface that allows NNI quantization algorithms to connect to different
DL model optimization backends (e.g., TensorRT, NNFusion). This gives users an end-to-end experience: after quantizing a model
with a quantization algorithm, the quantized model can be directly sped up with the connected optimization backend. NNI connects to
TensorRT at this stage, and will support more backends in the future.
Design and Implementation
-------------------------
To support speeding up mixed-precision quantization, we divide the framework into two parts: frontend and backend.
The frontend can be a popular training framework such as PyTorch or TensorFlow, and the backend an inference
framework for different hardware, such as TensorRT. At present, we support PyTorch as the frontend and
TensorRT as the backend. To convert a PyTorch model to a TensorRT engine, we leverage ONNX as the intermediate graph
representation: we first convert the PyTorch model to an ONNX model, and then TensorRT parses the ONNX
model to generate the inference engine.
Quantization-aware training combines the NNI quantization algorithm QAT and the NNI quantization speedup tool.
Users should set the config to train a quantized model using the QAT algorithm (please refer to :doc:`NNI Quantization Algorithms <../compression/quantizer>`).
After quantization-aware training, users get a new config with calibration parameters and a model with quantized weights.
By passing the new config and model to the quantization speedup tool, users get a real mixed-precision speedup engine for inference.
After getting the mixed-precision engine, users can run inference with input data.
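
The PyTorch-to-ONNX step in this pipeline is the standard ``torch.onnx.export`` conversion, roughly
(an illustrative sketch, not the exact code used inside the tool):

.. code-block:: python

    import torch

    dummy_input = torch.rand(32, 1, 28, 28)
    torch.onnx.export(model, dummy_input, 'model.onnx')
    # the resulting ONNX file is then parsed by TensorRT to build the inference engine
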
Note
* We recommend using "cpu" (host) as the data device (for both inference data and calibration data), since data should be on the host initially and it will be transferred to the device before inference. If the data device is not "cpu" (host), this tool will transfer it back to "cpu", which may add unnecessary overhead.
* Users can also do post-training quantization by leveraging TensorRT directly (a calibration dataset needs to be provided).
* Not all op types are supported right now. At present, NNI supports Conv, Linear, Relu and MaxPool. More op types will be supported in future releases.
Prerequisite
------------
CUDA version >= 11.0
TensorRT version >= 7.2
Note
* If you haven't installed TensorRT before or are using an old version, please refer to the `TensorRT Installation Guide <https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html>`__
Usage
-----
"""
# %%
import torch
import torch.nn.functional as F
from torch.optim import SGD
from scripts.compression_mnist_model import TorchModel, device, trainer, evaluator, test_trt
config_list = [{
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_types': ['Conv2d']
}, {
'quant_types': ['output'],
'quant_bits': {'output': 8},
'op_types': ['ReLU']
}, {
'quant_types': ['input', 'weight'],
'quant_bits': {'input': 8, 'weight': 8},
'op_names': ['fc1', 'fc2']
}]
model = TorchModel().to(device)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.5)
criterion = F.nll_loss
dummy_input = torch.rand(32, 1, 28, 28).to(device)
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input)
quantizer.compress()
# %%
# fine-tune the model using QAT
for epoch in range(3):
    trainer(model, optimizer, criterion)
    evaluator(model)
# %%
# export model and get calibration_config
import os
os.makedirs('log', exist_ok=True)
model_path = "./log/mnist_model.pth"
calibration_path = "./log/mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
print("calibration_config: ", calibration_config)
# %%
# build the TensorRT engine to get a real speedup
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
input_shape = (32, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
engine.compress()
test_trt(engine)
# %%
# Note that NNI also supports post-training quantization directly; please refer to the complete examples for details.
#
# For complete examples please refer to :githublink:`the code <examples/model_compress/quantization/mixed_precision_speedup_mnist.py>`.
#
# For more parameters of the ``ModelSpeedupTensorRT`` class, you can refer to :doc:`Model Compression API Reference <../reference/compression/quantization_speedup>`.
#
# Mnist test
# ^^^^^^^^^^
#
# on one GTX2080 GPU,
# input tensor: ``torch.randn(128, 1, 28, 28)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - quantization strategy
# - Latency
# - accuracy
# * - all in 32bit
# - 0.001199961
# - 96%
# * - mixed precision(average bit 20.4)
# - 0.000753688
# - 96%
# * - all in 8bit
# - 0.000229869
# - 93.7%
#
# Cifar10 resnet18 test (train one epoch)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# on one GTX2080 GPU,
# input tensor: ``torch.randn(128, 3, 32, 32)``
#
# .. list-table::
# :header-rows: 1
# :widths: auto
#
# * - quantization strategy
# - Latency
# - accuracy
# * - all in 32bit
# - 0.003286268
# - 54.21%
# * - mixed precision(average bit 11.55)
# - 0.001358022
# - 54.78%
# * - all in 8bit
# - 0.000859139
# - 52.81%
from pathlib import Path
root_path = Path(__file__).parent.parent
# define the model
import torch
from torch import nn
from torch.nn import functional as F
class TorchModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, 1)
        self.conv2 = nn.Conv2d(6, 16, 5, 1)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.relu4 = nn.ReLU()
        self.pool1 = nn.MaxPool2d((2, 2))
        self.pool2 = nn.MaxPool2d((2, 2))

    def forward(self, x):
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = self.relu3(self.fc1(x))
        x = self.relu4(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# load data
from torchvision import datasets, transforms
train_loader = torch.utils.data.DataLoader(
datasets.MNIST(root_path / 'data', train=True, download=True, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])), batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST(root_path / 'data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])), batch_size=1000, shuffle=True)
# define the trainer and evaluator
def trainer(model, optimizer, criterion):
    # training the model
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def evaluator(model):
    # evaluating the model accuracy and average test loss
    model.eval()
    test_loss = 0
    correct = 0
    test_dataset_length = len(test_loader.dataset)
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= test_dataset_length
    accuracy = 100. * correct / test_dataset_length
    print('Average test loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(test_loss, correct, test_dataset_length, accuracy))
def test_trt(engine):
    test_loss = 0
    correct = 0
    time_elapsed = 0
    for data, target in test_loader:
        output, time = engine.inference(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        time_elapsed += time
    test_loss /= len(test_loader.dataset)
    print('Loss: {} Accuracy: {}%'.format(
        test_loss, 100 * correct / len(test_loader.dataset)))
    print("Inference elapsed time (whole dataset): {}s".format(time_elapsed))
@@ -10,6 +10,7 @@ from .runtime.log import init_logger
 init_logger()
 from .common.serializer import trace, dump, load
+from .experiment import Experiment
 from .runtime.env_vars import dispatcher_env_vars
 from .utils import ClassArgsValidator
@@ -19,7 +20,7 @@ if dispatcher_env_vars.SDK_PROCESS != 'dispatcher':
 from .common.nas_utils import training_update
 class NoMoreTrialError(Exception):
-    def __init__(self, ErrorInfo):
+    def __init__(self, ErrorInfo='Search space fully explored'):
         super().__init__(self)
         self.errorinfo = ErrorInfo
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
+from __future__ import annotations
 import inspect
 from pathlib import Path, PurePath
-from typing import overload, Union, List
 from nni.experiment import Experiment, ExperimentConfig
 from nni.algorithms.compression.pytorch.auto_compress.interface import AbstractAutoCompressionModule
@@ -11,49 +12,19 @@ from nni.algorithms.compression.pytorch.auto_compress.interface import AbstractAutoCompressionModule
 class AutoCompressionExperiment(Experiment):
-    @overload
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config: ExperimentConfig) -> None:
-        """
-        Prepare an experiment.
-        Use `Experiment.run()` to launch it.
-        Parameters
-        ----------
-        auto_compress_module
-            The module provided by the user implements the `AbstractAutoCompressionModule` interfaces.
-            Remember put the module file under `trial_code_directory`.
-        config
-            Experiment configuration.
-        """
-        ...
-    @overload
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, training_service: Union[str, List[str]]) -> None:
-        """
-        Prepare an experiment, leaving configuration fields to be set later.
-        Example usage::
-            experiment = Experiment(auto_compress_module, 'remote')
-            experiment.config.trial_command = 'python3 trial.py'
-            experiment.config.machines.append(RemoteMachineConfig(ip=..., user_name=...))
-            ...
-            experiment.run(8080)
-        Parameters
-        ----------
-        auto_compress_module
-            The module provided by the user implements the `AbstractAutoCompressionModule` interfaces.
-            Remember put the module file under `trial_code_directory`.
-        training_service
-            Name of training service.
-            Supported value: "local", "remote", "openpai", "aml", "kubeflow", "frameworkcontroller", "adl" and hybrid training service.
-        """
-        ...
-    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config=None, training_service=None):
-        super().__init__(config, training_service)
+    def __init__(self, auto_compress_module: AbstractAutoCompressionModule, config_or_platform: ExperimentConfig | str | list[str]) -> None:
+        """
+        Prepare an auto compression experiment.
+        Parameters
+        ----------
+        auto_compress_module
+            The module provided by the user implements the `AbstractAutoCompressionModule` interfaces.
+            Remember put the module file under `trial_code_directory`.
+        config_or_platform
+            Experiment configuration or training service name.
+        """
+        super().__init__(config_or_platform)
         self.module_file_path = str(PurePath(inspect.getfile(auto_compress_module)))
         self.module_name = auto_compress_module.__name__
@@ -201,7 +201,7 @@ class AutoCompressPruner(Pruner):
             ADMMpruner.export_model(os.path.join(self._experiment_data_dir, 'model_admm_masked.pth'), os.path.join(
                 self._experiment_data_dir, 'mask.pth'))
-            # use speed up to prune the model before next iteration,
+            # use speedup to prune the model before next iteration,
             # because SimulatedAnnealingPruner & ADMMPruner don't take masked models
             self._model_to_prune.load_state_dict(torch.load(os.path.join(
                 self._experiment_data_dir, 'model_admm_masked.pth')))
@@ -35,7 +35,7 @@ class DependencyAwarePruner(Pruner):
         if self.dependency_aware:
             if not self._supported_dependency_aware():
-                raise ValueError('This pruner does not support dependency aware!')
+                raise ValueError('This pruner does not support dependency-aware!')
             errmsg = "When dependency_aware is set, the dummy_input should not be None"
             assert self.dummy_input is not None, errmsg
@@ -10,7 +10,7 @@ import torch
 from schema import And, Optional
 from nni.compression.pytorch.compressor import Pruner
 from nni.compression.pytorch.utils.config_validation import PrunerSchema
-from nni.compression.pytorch.utils.sensitivity_analysis import SensitivityAnalysis
+from nni.compression.pytorch.utils import SensitivityAnalysis
 from .constants_pruner import PRUNER_DICT
@@ -22,9 +22,74 @@ class ClipGrad(QuantGrad):
class BNNQuantizer(Quantizer):
r"""
Binarized Neural Networks, as defined in:
`Binarized Neural Networks: Training Deep Neural Networks with Weights and
Activations Constrained to +1 or -1 <https://arxiv.org/abs/1602.02830>`__,
..
We introduce a method to train Binarized Neural Networks (BNNs) - neural networks with binary weights and activations at run-time.
At training-time the binary weights and activations are used for computing the parameters gradients. During the forward pass,
BNNs drastically reduce memory size and accesses, and replace most arithmetic operations with bit-wise operations,
which is expected to substantially improve power-efficiency.
Parameters
----------
model : torch.nn.Module
Model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys for dict:
- quant_types : List[str]
Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
- quant_bits : Union[int, Dict[str, int]]
Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
When the type is int, all quantization types share same bits length.
- op_types : List[str]
Types of nn.module you want to apply quantization, eg. 'Conv2d'.
- op_names : List[str]
Names of nn.module you want to apply quantization, eg. 'conv1'.
- exclude : bool
Set True then the layers setting by op_types and op_names will be excluded from quantization.
optimizer : torch.optim.Optimizer
Optimizer is required in `BNNQuantizer`, NNI will patch the optimizer and count the optimize step number.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import BNNQuantizer
>>> model = ...
>>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
>>> optimizer = ...
>>> quantizer = BNNQuantizer(model, config_list, optimizer)
>>> quantizer.compress()
>>> # Training Process...
For detailed example please refer to
:githublink:`examples/model_compress/quantization/BNN_quantizer_cifar10.py
<examples/model_compress/quantization/BNN_quantizer_cifar10.py>`.
Notes
-----
**Results**
We implemented one of the experiments in
`Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1
<https://arxiv.org/abs/1602.02830>`__,
we quantized the **VGGNet** for CIFAR-10 in the paper. Our experiments results are as follows:
.. list-table::
:header-rows: 1
:widths: auto
* - Model
- Accuracy
* - VGGNet
- 86.93%
The experiments code can be found at
:githublink:`examples/model_compress/quantization/BNN_quantizer_cifar10.py
<examples/model_compress/quantization/BNN_quantizer_cifar10.py>`
""" """
def __init__(self, model, config_list, optimizer): def __init__(self, model, config_list, optimizer):
......
...@@ -13,9 +13,45 @@ logger = logging.getLogger(__name__) ...@@ -13,9 +13,45 @@ logger = logging.getLogger(__name__)
class DoReFaQuantizer(Quantizer): class DoReFaQuantizer(Quantizer):
"""Quantizer using the DoReFa scheme, as defined in: r"""
Zhou et al., DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients Quantizer using the DoReFa scheme, as defined in:
(https://arxiv.org/abs/1606.06160) `DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients <https://arxiv.org/abs/1606.06160>`__,
authors Shuchang Zhou and Yuxin Wu provide an algorithm named DoReFa to quantize the weight, activation and gradients with training.
Parameters
----------
model : torch.nn.Module
Model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys for dict:
- quant_types : List[str]
Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
- quant_bits : Union[int, Dict[str, int]]
Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
When the type is int, all quantization types share same bits length.
- op_types : List[str]
Types of nn.module you want to apply quantization, eg. 'Conv2d'.
- op_names : List[str]
Names of nn.module you want to apply quantization, eg. 'conv1'.
- exclude : bool
Set True then the layers setting by op_types and op_names will be excluded from quantization.
optimizer : torch.optim.Optimizer
Optimizer is required in `DoReFaQuantizer`, NNI will patch the optimizer and count the optimize step number.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import DoReFaQuantizer
>>> model = ...
>>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
>>> optimizer = ...
>>> quantizer = DoReFaQuantizer(model, config_list, optimizer)
>>> quantizer.compress()
>>> # Training Process...
For detailed example please refer to
:githublink:`examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py
<examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py>`.
""" """
def __init__(self, model, config_list, optimizer): def __init__(self, model, config_list, optimizer):
......
...@@ -11,35 +11,56 @@ logger = logging.getLogger(__name__) ...@@ -11,35 +11,56 @@ logger = logging.getLogger(__name__)
class LsqQuantizer(Quantizer): class LsqQuantizer(Quantizer):
"""Quantizer defined in: r"""
Learned Step Size Quantization (ICLR 2020) Quantizer defined in: `LEARNED STEP SIZE QUANTIZATION <https://arxiv.org/pdf/1902.08153.pdf>`__,
https://arxiv.org/pdf/1902.08153.pdf authors Steven K. Esser and Jeffrey L. McKinstry provide an algorithm to train the scales with gradients.
..
The authors introduce a novel means to estimate and scale the task loss gradient at each weight and activation
layer's quantizer step size, such that it can be learned in conjunction with other network parameters.
Parameters
----------
model : torch.nn.Module
The model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys for dict:
- quant_types : List[str]
Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
- quant_bits : Union[int, Dict[str, int]]
Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
When the type is int, all quantization types share same bits length.
- op_types : List[str]
Types of nn.module you want to apply quantization, eg. 'Conv2d'.
- op_names : List[str]
Names of nn.module you want to apply quantization, eg. 'conv1'.
- exclude : bool
Set True then the layers setting by op_types and op_names will be excluded from quantization.
optimizer : torch.optim.Optimizer
Optimizer is required in `LsqQuantizer`, NNI will patch the optimizer and count the optimize step number.
dummy_input : Tuple[torch.Tensor]
Inputs to the model, which are used to get the graph of the module. The graph is used to find Conv-Bn patterns.
And then the batch normalization folding would be enabled. If dummy_input is not given,
the batch normalization folding would be disabled.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import LsqQuantizer
>>> model = ...
>>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
>>> optimizer = ...
>>> dummy_input = torch.rand(...)
>>> quantizer = LsqQuantizer(model, config_list, optimizer, dummy_input=dummy_input)
>>> quantizer.compress()
>>> # Training Process...
For detailed example please refer to
:githublink:`examples/model_compress/quantization/LSQ_torch_quantizer.py <examples/model_compress/quantization/LSQ_torch_quantizer.py>`.
""" """
def __init__(self, model, config_list, optimizer, dummy_input=None): def __init__(self, model, config_list, optimizer, dummy_input=None):
"""
Parameters
----------
model : torch.nn.Module
the model to be quantized
config_list : list of dict
list of configurations for quantization
supported keys for dict:
- quant_types : list of string
type of quantization you want to apply, currently support 'weight', 'input', 'output'
- quant_bits : int or dict of {str : int}
bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
when the type is int, all quantization types share same bits length
- quant_start_step : int
disable quantization until model are run by certain number of steps, this allows the network to enter a more stable
state where output quantization ranges do not exclude a significant fraction of values, default value is 0
- op_types : list of string
types of nn.module you want to apply quantization, eg. 'Conv2d'
- dummy_input : tuple of tensor
inputs to the model, which are used to get the graph of the module. The graph is used to find
Conv-Bn patterns. And then the batch normalization folding would be enabled. If dummy_input is not
given, the batch normalization folding would be disabled.
"""
assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
super().__init__(model, config_list, optimizer, dummy_input)
device = next(model.parameters()).device
@@ -12,7 +12,32 @@ logger = logging.getLogger(__name__)
class NaiveQuantizer(Quantizer):
r"""
Quantize weight to 8 bits directly.
Parameters
----------
model : torch.nn.Module
Model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys:
- quant_types : List[str]
Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
- quant_bits : Union[int, Dict[str, int]]
Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
when the type is int, all quantization types share same bits length.
- op_types : List[str]
Types of nn.module you want to apply quantization, eg. 'Conv2d'.
- op_names : List[str]
Names of nn.module you want to apply quantization, eg. 'conv1'.
- exclude : bool
Set True then the layers setting by op_types and op_names will be excluded from quantization.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import NaiveQuantizer
>>> model = ...
>>> NaiveQuantizer(model).compress()
""" """
def __init__(self, model, config_list, optimizer=None): def __init__(self, model, config_list, optimizer=None):
......
...@@ -14,7 +14,12 @@ logger = logging.getLogger(__name__) ...@@ -14,7 +14,12 @@ logger = logging.getLogger(__name__)
class ObserverQuantizer(Quantizer): class ObserverQuantizer(Quantizer):
"""This quantizer uses observers to record weight/output statistics to get quantization information. r"""
Observer quantizer is a framework of post-training quantization.
It will insert observers into the place where the quantization will happen.
During quantization calibration, each observer will record all the tensors it 'sees'.
These tensors will be used to calculate the quantization statistics after calibration.
The whole process can be divided into three steps:
1. It will register observers to the place where quantization would happen (just like registering hooks).
@@ -23,6 +28,66 @@ class ObserverQuantizer(Quantizer):
Note that the observer type, tensor dtype and quantization qscheme are hard-coded for now. Their customization
is under development and will be ready soon.
Parameters
----------
model : torch.nn.Module
Model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys:
- quant_types : List[str]
Type of quantization you want to apply, currently support 'weight', 'input', 'output'.
- quant_bits : Union[int, Dict[str, int]]
Bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
when the type is int, all quantization types share same bits length.
- op_types : List[str]
Types of nn.module you want to apply quantization, eg. 'Conv2d'.
- op_names : List[str]
Names of nn.module you want to apply quantization, eg. 'conv1'.
- exclude : bool
Set True then the layers setting by op_types and op_names will be excluded from quantization.
optimizer : torch.optim.Optimizer
Optimizer is optional in `ObserverQuantizer`.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import ObserverQuantizer
>>> model = ...
>>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
>>> quantizer = ObserverQuantizer(model, config_list)
>>> # define a calibration function
>>> def calibration(model, calib_loader):
>>> model.eval()
>>> with torch.no_grad():
>>> for data, _ in calib_loader:
>>> model(data)
>>> calibration(model, calib_loader)
>>> quantizer.compress()
For detailed example please refer to
:githublink:`examples/model_compress/quantization/observer_quantizer.py <examples/model_compress/quantization/observer_quantizer.py>`.
.. note::
This quantizer is still under development for now. Some quantizer settings are hard-coded:
- weight observer: per_tensor_symmetric, qint8
- output observer: per_tensor_affine, quint8, reduce_range=True
Other settings (such as quant_type and op_names) can be configured.
Notes
-----
**About the compress API**
Before the `compress` API is called, the model will only record tensors' statistics and no quantization process will be executed.
After the `compress` API is called, the model will NOT record tensors' statistics any more. The quantization scale and zero point will
be generated for each tensor and will be used to quantize each tensor during inference (we call it evaluation mode)
**About calibration**
Usually we pick up about 100 training/evaluation examples for calibration. If you found the accuracy is a bit low, try
to reduce the number of calibration examples.
""" """
def __init__(self, model, config_list, optimizer=None): def __init__(self, model, config_list, optimizer=None):
......
...@@ -107,36 +107,151 @@ def update_ema(biased_ema, value, decay): ...@@ -107,36 +107,151 @@ def update_ema(biased_ema, value, decay):
class QAT_Quantizer(Quantizer): class QAT_Quantizer(Quantizer):
"""Quantizer defined in: r"""
Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference Quantizer defined in:
http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf `Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference
<http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf>`__
Authors Benoit Jacob and Skirmantas Kligys provide an algorithm to quantize the model with training.
..
We propose an approach that simulates quantization effects in the forward pass of training.
Backpropagation still happens as usual, and all weights and biases are stored in floating point
so that they can be easily nudged by small amounts.
The forward propagation pass however simulates quantized inference as it will happen in the inference engine,
by implementing in floating-point arithmetic the rounding behavior of the quantization scheme:
* Weights are quantized before they are convolved with the input. If batch normalization (see [17]) is used for the layer,
the batch normalization parameters are “folded into” the weights before quantization.
* Activations are quantized at points where they would be during inference,
e.g. after the activation function is applied to a convolutional or fully connected layer’s output,
or after a bypass connection adds or concatenates the outputs of several layers together such as in ResNets.
Parameters
----------
model : torch.nn.Module
Model to be quantized.
config_list : List[Dict]
List of configurations for quantization. Supported keys for dict:
- quant_types : List[str]
Type of quantization you want to apply; currently 'weight', 'input' and 'output' are supported.
- quant_bits : Union[int, Dict[str, int]]
Bit length of quantization. When a dict is given, the key is the quantization type and the value is the bit length, e.g. {'weight': 8};
when an int is given, all quantization types share the same bit length.
- quant_start_step : int
Disable quantization until the model has been run for a certain number of steps. This allows the network to reach a more stable
state in which output quantization ranges do not exclude a significant fraction of values. Default value is 0.
- op_types : List[str]
Types of nn.Module you want to apply quantization to, e.g. 'Conv2d'.
- op_names : List[str]
Names of nn.Module you want to apply quantization to, e.g. 'conv1'.
- exclude : bool
If set to True, the layers selected by op_types and op_names will be excluded from quantization.
optimizer : torch.optim.Optimizer
The optimizer is required by `QAT_Quantizer`; NNI will patch the optimizer and count the number of optimization steps.
dummy_input : Tuple[torch.Tensor]
Inputs to the model, used to trace the graph of the module. The graph is used to find Conv-BN patterns so that
batch normalization folding can be enabled. If dummy_input is not given, batch normalization folding is disabled.
Examples
--------
>>> from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
>>> model = ...
>>> config_list = [{'quant_types': ['weight', 'input'], 'quant_bits': {'weight': 8, 'input': 8}, 'op_types': ['Conv2d']}]
>>> optimizer = ...
>>> dummy_input = torch.rand(...)
>>> quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input=dummy_input)
>>> quantizer.compress()
>>> # Training Process...
For detailed example please refer to
:githublink:`examples/model_compress/quantization/QAT_torch_quantizer.py <examples/model_compress/quantization/QAT_torch_quantizer.py>`.
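The ``# Training Process...`` placeholder above stands for a regular PyTorch training loop; because NNI patches the optimizer, every ``optimizer.step()`` also advances the step counter used by ``quant_start_step``. A minimal sketch (``train_loader``, ``loss_fn`` and ``num_epochs`` are hypothetical):
.. code-block:: python
    for epoch in range(num_epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            loss = loss_fn(model(data), target)
            loss.backward()
            # the patched optimizer also counts quantization steps here
            optimizer.step()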
Notes
-----
**Batch normalization folding**
Batch normalization folding is supported in QAT quantizer. It can be easily enabled by passing an argument `dummy_input` to
the quantizer, like:
.. code-block:: python
# assume your model takes an input of shape (1, 1, 28, 28)
# and dummy_input must be on the same device as the model
dummy_input = torch.randn(1, 1, 28, 28)
# pass the dummy_input to the quantizer
quantizer = QAT_Quantizer(model, config_list, optimizer, dummy_input=dummy_input)
The quantizer will automatically detect Conv-BN patterns and simulate the batch normalization folding process in the training
graph. Note that when the quantization-aware training process is finished, the folded weight/bias will be restored after calling
`quantizer.export_model`.
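A minimal sketch of exporting after training; the file paths are placeholders and the ``model_path``/``calibration_path`` keyword names are assumed here and may differ between NNI versions:
.. code-block:: python
    # export_model restores the folded weight/bias and saves the calibration
    # parameters (scale / zero point) gathered during quantization-aware training
    calibration_config = quantizer.export_model(
        model_path='qat_model.pth',              # placeholder path
        calibration_path='qat_calibration.pth')  # placeholder path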
**Quantization dtype and scheme customization**
Different backends on different devices use different quantization strategies (i.e. dtype (int or uint) and
scheme (per-tensor or per-channel and symmetric or affine)). QAT quantizer supports customization of mainstream dtypes and schemes.
There are two ways to set them. One way is to set them globally through the function `set_quant_scheme_dtype`, like:
.. code-block:: python
from nni.compression.pytorch.quantization.settings import set_quant_scheme_dtype
# This will set all the quantization of 'input' in 'per_tensor_affine' and 'uint' manner
set_quant_scheme_dtype('input', 'per_tensor_affine', 'uint')
# This will set all the quantization of 'output' in 'per_tensor_symmetric' and 'int' manner
set_quant_scheme_dtype('output', 'per_tensor_symmetric', 'int')
# This will set all the quantization of 'weight' in 'per_channel_symmetric' and 'int' manner
set_quant_scheme_dtype('weight', 'per_channel_symmetric', 'int')
The other way is more fine-grained. You can customize the dtype and scheme in each entry of the quantization config list, like:
.. code-block:: python
config_list = [{
'quant_types': ['weight'],
'quant_bits': 8,
'op_types':['Conv2d', 'Linear'],
'quant_dtype': 'int',
'quant_scheme': 'per_channel_symmetric'
}, {
'quant_types': ['output'],
'quant_bits': 8,
'quant_start_step': 7000,
'op_types':['ReLU6'],
'quant_dtype': 'uint',
'quant_scheme': 'per_tensor_affine'
}]
**Multi-GPU training**
QAT quantizer natively supports multi-gpu training (DataParallel and DistributedDataParallel). Note that the quantizer
instantiation should happen before you wrap your model with DataParallel or DistributedDataParallel. For example:
.. code-block:: python
from torch.nn.parallel import DistributedDataParallel as DDP
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
model = define_your_model()
quantizer = QAT_Quantizer(model, **other_params)  # <--- QAT_Quantizer instantiation
quantizer.compress()
model = DDP(model)
for i in range(epochs):
train(model)
eval(model)
""" """
def __init__(self, model, config_list, optimizer, dummy_input=None): def __init__(self, model, config_list, optimizer, dummy_input=None):
"""
Parameters
----------
layer : LayerInfo
the layer to quantize
config_list : list of dict
list of configurations for quantization
supported keys for dict:
- quant_types : list of string
type of quantization you want to apply, currently support 'weight', 'input', 'output'
- quant_bits : int or dict of {str : int}
bits length of quantization, key is the quantization type, value is the length, eg. {'weight', 8},
when the type is int, all quantization types share same bits length
- quant_start_step : int
disable quantization until model are run by certain number of steps, this allows the network to enter a more stable
state where output quantization ranges do not exclude a significant fraction of values, default value is 0
- op_types : list of string
types of nn.module you want to apply quantization, eg. 'Conv2d'
- dummy_input : tuple of tensor
inputs to the model, which are used to get the graph of the module. The graph is used to find
Conv-Bn patterns. And then the batch normalization folding would be enabled. If dummy_input is not
given, the batch normalization folding would be disabled.
"""
assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
super().__init__(model, config_list, optimizer, dummy_input)
self.quant_grad = QATGrad.apply
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .one_shot_pruner import *