Unverified Commit f0e3c584 authored by lin bin, committed by GitHub

Combine tensorrt tool with NNI quantization algorithms. (#3488)

parent 80bc9537
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT
class Mnist(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
self.fc2 = torch.nn.Linear(500, 10)
self.relu1 = torch.nn.ReLU6()
self.relu2 = torch.nn.ReLU6()
self.relu3 = torch.nn.ReLU6()
self.max_pool1 = torch.nn.MaxPool2d(2, 2)
self.max_pool2 = torch.nn.MaxPool2d(2, 2)
def forward(self, x):
x = self.relu1(self.conv1(x))
x = self.max_pool1(x)
x = self.relu2(self.conv2(x))
x = self.max_pool2(x)
x = x.view(-1, 4 * 4 * 50)
x = self.relu3(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
def train(model, device, train_loader, optimizer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % 100 == 0:
print('{:2.0f}% Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('Loss: {} Accuracy: {}%\n'.format(
test_loss, 100 * correct / len(test_loader.dataset)))
def test_trt(engine, test_loader):
test_loss = 0
correct = 0
time_elapsed = 0
for data, target in test_loader:
output, infer_time = engine.inference(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
time_elapsed += infer_time
test_loss /= len(test_loader.dataset)
print('Loss: {} Accuracy: {}%'.format(
test_loss, 100 * correct / len(test_loader.dataset)))
print("Inference elapsed_time (whole dataset): {}s".format(time_elasped))
def post_training_quantization_example(train_loader, test_loader, device):
model = Mnist()
config = {
'conv1':{'weight_bit':8, 'activation_bit':8},
'conv2':{'weight_bit':32, 'activation_bit':32},
'fc1':{'weight_bit':16, 'activation_bit':16},
'fc2':{'weight_bit':8, 'activation_bit':8}
}
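# Each entry above maps an op name to the bit widths handed to the TensorRT backend;
# valid_config() in the speedup module only accepts 8, 16 or 32, so conv2 stays in fp32,
# fc1 runs in fp16, and conv1/fc2 are quantized to int8.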
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
model.to(device)
for epoch in range(1):
print('# Epoch {} #'.format(epoch))
train(model, device, train_loader, optimizer)
test(model, device, test_loader)
batch_size = 32
input_shape = (batch_size, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=config, calib_data_loader=train_loader, batchsize=batch_size)
engine.compress()
test_trt(engine, test_loader)
def quantization_aware_training_example(train_loader, test_loader, device):
model = Mnist()
configure_list = [{
'quant_types': ['weight', 'output'],
'quant_bits': {'weight':8, 'output':8},
'op_names': ['conv1']
}, {
'quant_types': ['output'],
'quant_bits': {'output':8},
'op_names': ['relu1']
}, {
'quant_types': ['weight', 'output'],
'quant_bits': {'weight':8, 'output':8},
'op_names': ['conv2']
}, {
'quant_types': ['output'],
'quant_bits': {'output':8},
'op_names': ['relu2']
}
]
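# Each rule above follows NNI's quantization config format: 'quant_types' selects what to
# quantize ('weight' and/or 'output'), 'quant_bits' gives the bit width per quantized type,
# and 'op_names' lists the layers the rule applies to.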
# finetune the model by using QAT
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer)
quantizer.compress()
model.to(device)
for epoch in range(1):
print('# Epoch {} #'.format(epoch))
train(model, device, train_loader, optimizer)
test(model, device, test_loader)
model_path = "mnist_model.pth"
calibration_path = "mnist_calibration.pth"
calibration_config = quantizer.export_model(model_path, calibration_path)
test(model, device, test_loader)
print("calibration_config: ", calibration_config)
batch_size = 32
input_shape = (batch_size, 1, 28, 28)
engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
engine.compress()
test_trt(engine, test_loader)
def main():
torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('data', train=True, download=True, transform=trans),
batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('data', train=False, transform=trans),
batch_size=1000, shuffle=True)
# post-training quantization on TensorRT
post_training_quantization_example(train_loader, test_loader, device)
# combine NNI quantization algorithm QAT with backend framework TensorRT
quantization_aware_training_example(train_loader, test_loader, device)
if __name__ == '__main__':
main()
\ No newline at end of file
......@@ -152,22 +152,23 @@ class QAT_Quantizer(Quantizer):
for layer, config in modules_to_compress:
layer.module.register_buffer("zero_point", torch.Tensor([0.0]))
layer.module.register_buffer("scale", torch.Tensor([1.0]))
layer.module.register_buffer('ema_decay', torch.Tensor([0.99]))
if "weight" in config.get("quant_types", []):
layer.module.register_buffer('weight_bit', torch.zeros(1))
layer.module.register_buffer('tracked_min_input', torch.zeros(1))
layer.module.register_buffer('tracked_max_input', torch.zeros(1))
if "output" in config.get("quant_types", []):
layer.module.register_buffer('activation_bit', torch.zeros(1))
layer.module.register_buffer('ema_decay', torch.Tensor([0.99]))
layer.module.register_buffer('tracked_min_biased', torch.zeros(1))
layer.module.register_buffer('tracked_min', torch.zeros(1))
layer.module.register_buffer('tracked_max_biased', torch.zeros(1))
layer.module.register_buffer('tracked_max', torch.zeros(1))
layer.module.register_buffer('tracked_min_activation', torch.zeros(1))
layer.module.register_buffer('tracked_max_activation', torch.zeros(1))
def _del_simulated_attr(self, module):
"""
delete redundant parameters in quantize module
"""
del_attr_list = ['old_weight', 'ema_decay', 'tracked_min_biased', 'tracked_max_biased', 'tracked_min', \
'tracked_max', 'scale', 'zero_point', 'weight_bit', 'activation_bit']
del_attr_list = ['old_weight', 'ema_decay', 'tracked_min_activation', 'tracked_max_activation', 'tracked_min_input', \
'tracked_max_input', 'scale', 'zero_point', 'weight_bit', 'activation_bit']
for attr in del_attr_list:
if hasattr(module, attr):
delattr(module, attr)
......@@ -243,15 +244,26 @@ class QAT_Quantizer(Quantizer):
def quantize_weight(self, wrapper, **kwargs):
config = wrapper.config
module = wrapper.module
input = kwargs['input_tensor']
weight = copy.deepcopy(wrapper.module.old_weight.data)
weight_bits = get_bits_length(config, 'weight')
quant_start_step = config.get('quant_start_step', 0)
assert weight_bits >= 1, "quant bits length should be at least 1"
# we don't update weight statistics in the evaluation stage
if quant_start_step > self.bound_model.steps or not wrapper.training:
if quant_start_step > self.bound_model.steps:
module.tracked_min_input, module.tracked_max_input = torch.min(input), torch.max(input)
return weight
if not wrapper.training:
return weight
current_min, current_max = torch.min(input), torch.max(input)
module.tracked_min_input = update_ema(module.tracked_min_input, current_min,
module.ema_decay)
module.tracked_max_input = update_ema(module.tracked_max_input, current_max,
module.ema_decay)
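# Note: update_ema is assumed here to be the standard exponential moving average,
# roughly new = decay * old + (1 - decay) * current, so tracked_min_input and
# tracked_max_input smoothly follow the observed input range during training.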
# if bias exists, quantize bias to uint32
if hasattr(wrapper.module, 'bias') and wrapper.module.bias is not None:
bias = wrapper.module.bias.data
......@@ -281,17 +293,17 @@ class QAT_Quantizer(Quantizer):
assert output_bits >= 1, "quant bits length should be at least 1"
if quant_start_step > self.bound_model.steps:
module.tracked_min_biased, module.tracked_max_biased = torch.min(output), torch.max(output)
module.tracked_min_activation, module.tracked_max_activation = torch.min(output), torch.max(output)
return output
# we don't update output quantization parameters in the evaluation stage
if wrapper.training:
current_min, current_max = torch.min(output), torch.max(output)
module.tracked_min_biased = update_ema(module.tracked_min_biased, current_min,
module.tracked_min_activation = update_ema(module.tracked_min_activation, current_min,
module.ema_decay)
module.tracked_max_biased = update_ema(module.tracked_max_biased, current_max,
module.tracked_max_activation = update_ema(module.tracked_max_activation, current_max,
module.ema_decay)
module.scale, module.zero_point = update_quantization_param(output_bits, module.tracked_min_biased, module.tracked_max_biased)
module.scale, module.zero_point = update_quantization_param(output_bits, module.tracked_min_activation, module.tracked_max_activation)
out = self._quantize(output_bits, module, output)
out = self._dequantize(module, out)
return out
......@@ -327,10 +339,12 @@ class QAT_Quantizer(Quantizer):
calibration_config[name] = {}
if hasattr(module, 'weight_bit'):
calibration_config[name]['weight_bit'] = int(module.weight_bit)
calibration_config[name]['tracked_min_input'] = float(module.tracked_min_input)
calibration_config[name]['tracked_max_input'] = float(module.tracked_max_input)
if hasattr(module, 'activation_bit'):
calibration_config[name]['activation_bit'] = int(module.activation_bit)
calibration_config[name]['tracked_min'] = float(module.tracked_min_biased)
calibration_config[name]['tracked_max'] = float(module.tracked_max_biased)
calibration_config[name]['tracked_min_activation'] = float(module.tracked_min_activation)
calibration_config[name]['tracked_max_activation'] = float(module.tracked_max_activation)
self._del_simulated_attr(module)
self.export_model_save(self.bound_model, model_path, calibration_config, calibration_path, onnx_path, input_shape, device)
......
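# With the renamed buffers above, an exported calibration_config entry looks roughly like
# the following (illustrative values only):
#     'conv1': {'weight_bit': 8,
#               'tracked_min_input': -0.42, 'tracked_max_input': 2.82,
#               'activation_bit': 8,
#               'tracked_min_activation': 0.0, 'tracked_max_activation': 6.0}
# which is the per-layer format consumed by ModelSpeedupTensorRT / build_engine below.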
......@@ -483,7 +483,7 @@ class QuantizerModuleWrapper(torch.nn.Module):
self.quantizer.quant_grad.apply(
self.module.old_weight,
QuantType.QUANT_WEIGHT,
self)
self, inputs[0])
result = self.module(*inputs)
else:
result = self.module(*inputs)
......@@ -511,14 +511,12 @@ class Quantizer(Compressor):
# and it is trainable, therefore, it should be added to optimizer.
self.optimizer.add_param_group({"params": wrapper.module.old_weight})
def quantize_weight(self, weight, wrapper, **kwargs):
def quantize_weight(self, wrapper, **kwargs):
"""
quantize should overload this method to quantize weight.
This method is effectively hooked to :meth:`forward` of the model.
Parameters
----------
weight : Tensor
weight that needs to be quantized
wrapper : QuantizerModuleWrapper
the wrapper for origin module
"""
......@@ -720,11 +718,11 @@ class QuantGrad(torch.autograd.Function):
return grad_output
@staticmethod
def forward(ctx, tensor, quant_type, wrapper, **kwargs):
def forward(ctx, tensor, quant_type, wrapper, input_tensor=None, **kwargs):
if quant_type == QuantType.QUANT_INPUT:
output = wrapper.quantizer.quantize_input(tensor, wrapper, **kwargs)
elif quant_type == QuantType.QUANT_WEIGHT:
output = wrapper.quantizer.quantize_weight(wrapper, **kwargs)
output = wrapper.quantizer.quantize_weight(wrapper, input_tensor=input_tensor, **kwargs)
elif quant_type == QuantType.QUANT_OUTPUT:
output = wrapper.quantizer.quantize_output(tensor, wrapper, **kwargs)
else:
......
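# Sketch (assuming the surrounding NNI Quantizer API): with the change above, a custom
# quantizer receives the layer input through kwargs instead of the raw weight, e.g.
#     def quantize_weight(self, wrapper, **kwargs):
#         input_tensor = kwargs.get('input_tensor')   # forwarded from QuantGrad.forward
#         weight = wrapper.module.old_weight.data     # the weight is read from the wrapper
#         ...
# exactly as QAT_Quantizer.quantize_weight does above.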
from .integrated_tensorrt import CalibrateType, ModelSpeedupTensorRT
\ No newline at end of file
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
class BaseModelSpeedup:
"""
Base speedup class for backend engine
"""
def __init__(self, model, config):
"""
Parameters
----------
model : pytorch model
The model to speed up by quantization.
config : dict
Config recording bit number and name of layers.
"""
self.model = model
self.config = config
def inference(self, test_data):
"""
This method should be overridden by the subclass to provide inference ability;
it should return both the inference output and the inference time.
Parameters
----------
test_data : numpy data
test data given to the inference engine
Returns
-------
numpy data
output data generated by the inference
float
latency of the inference process
"""
raise NotImplementedError('Backend engine must overload inference()')
def compress(self):
"""
This method should be overridden by the subclass to build the inference
engine that will be used to process input data
"""
raise NotImplementedError('Backend engine must overload compress()')
def export_quantized_model(self, path):
"""
This method should be overridden by the subclass to export the quantized
model built by the backend engine
"""
raise NotImplementedError('Backend engine must overload export_quantized_model()')
\ No newline at end of file
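# A minimal sketch of a custom backend (hypothetical class name, shown for illustration):
#     class MyBackendSpeedup(BaseModelSpeedup):
#         def compress(self):
#             self.engine = ...                    # build the real inference engine here
#         def inference(self, test_data):
#             return output, latency_in_seconds    # must return (output, inference time)
#         def export_quantized_model(self, path):
#             ...                                  # serialize the built engine to `path`
# ModelSpeedupTensorRT below implements exactly this interface.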
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import logging
import tensorrt as trt
import pycuda.driver as cuda
logger = logging.getLogger(__name__)
class Calibrator(trt.IInt8Calibrator):
def __init__(self, training_data, cache_file, batch_size=64, algorithm=trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2):
"""
Parameters
----------
training_data : numpy array
The data used to calibrate the quantized model
cache_file : str
The path where the calibration cache file is stored
batch_size : int
The batch size of the calibration process
algorithm : tensorrt.tensorrt.CalibrationAlgoType
The calibration algorithm; one of LEGACY_CALIBRATION,
ENTROPY_CALIBRATION, ENTROPY_CALIBRATION_2, MINMAX_CALIBRATION.
Please refer to https://docs.nvidia.com/deeplearning/tensorrt/api/
python_api/infer/Int8/Calibrator.html for details
"""
trt.IInt8Calibrator.__init__(self)
self.algorithm = algorithm
self.cache_file = cache_file
self.data = training_data
self.batch_size = batch_size
self.current_index = 0
# Allocate enough memory for a whole batch.
self.device_input = cuda.mem_alloc(self.data[0].nbytes * self.batch_size)
def get_algorithm(self):
return self.algorithm
def get_batch_size(self):
return self.batch_size
def get_batch(self, names):
"""
This method defines how the calibration data is fed to TensorRT, one batch at a time.
Parameters
----------
names : str
The names of the network inputs for each object in the bindings array
Returns
-------
list
A list of device memory pointers to the memory containing each network
input's data, or None if there are no more batches for calibration.
You can allocate these device buffers with pycuda, for example, and then
cast them to int to retrieve the pointer
"""
if self.current_index + self.batch_size > self.data.shape[0]:
return None
current_batch = int(self.current_index / self.batch_size)
if current_batch % 10 == 0:
logger.info("Calibrating batch %d, containing %d images", current_batch, self.batch_size)
batch = self.data[self.current_index:self.current_index + self.batch_size].ravel()
cuda.memcpy_htod(self.device_input, batch)
self.current_index += self.batch_size
memory_pointers = [self.device_input]
return memory_pointers
def read_calibration_cache(self):
"""
If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None.
Returns
-------
cache object
A cache object which contains calibration parameters for quantization
"""
if os.path.exists(self.cache_file):
with open(self.cache_file, "rb") as f:
return f.read()
def write_calibration_cache(self, cache):
"""
Write calibration cache to specific path.
Parameters
----------
cache : str
The calibration cache to write
"""
with open(self.cache_file, "wb") as f:
f.write(cache)
\ No newline at end of file
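# Usage sketch (assumed data shape, mirroring how ModelSpeedupTensorRT uses the class above):
#     import numpy as np
#     calib_data = np.random.randn(1024, 1, 28, 28).astype(np.float32)
#     calib = Calibrator(calib_data, "calibration.cache", batch_size=32,
#                        algorithm=trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2)
#     # `calib` is then passed to build_engine(...) as the int8 calibrator.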
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import onnx
import onnx.numpy_helper
"""
This module converts a PyTorch model to an ONNX model.
The conversion does not preserve PyTorch layer names in the ONNX graph, so a bit
configuration keyed by PyTorch layer names cannot be mapped to ONNX nodes directly.
To solve this, we wrap each PyTorch module in a wrapper that multiplies its input by a
constant layer index before the op's computation; the extra node lets the ONNX model
recover, for each node, the index (and hence the bit config) of the corresponding layer.
"""
class LayernameModuleWrapper(torch.nn.Module):
def __init__(self, module, module_bit) -> None:
"""
Parameters
----------
module : torch.nn.Module
Layer module of pytorch model
module_bit : int
The layer index (applied as a constant multiplier) to transfer into the ONNX graph
"""
super().__init__()
self.module = module
self.module_bit = module_bit
def forward(self, inputs):
inputs = inputs*self.module_bit
inputs = self.module(inputs)
return inputs
def _setattr(model, name, module):
"""
Parameters
----------
model : pytorch model
The model to speed up by quantization
name : str
name of pytorch module
module : torch.nn.Module
Layer module of pytorch model
"""
name_list = name.split(".")
for name in name_list[:-1]:
model = getattr(model, name)
setattr(model, name_list[-1], module)
def unwrapper(model_onnx, index2name, config):
"""
Fill the ONNX config and remove the wrapper nodes from the ONNX graph
Parameters
----------
model_onnx : onnx model
Onnx model which is converted from pytorch model
index2name : dict
Dictionary of layer index and name
config : dict
Config recording name of layers and calibration parameters
Returns
-------
onnx model
Onnx model which is converted from pytorch model
dict
The configuration of onnx model layers and calibration parameters
"""
# Support Gemm, Conv, Relu, Clip(Relu6) and Maxpool
support_op = ['Gemm', 'Conv', 'Relu', 'Clip', 'MaxP']
idx = 0
onnx_config = {}
while idx < len(model_onnx.graph.node):
nd = model_onnx.graph.node[idx]
if nd.name[0:4] in support_op and idx > 1:
# Grab the constant node and the multiply node inserted by the wrapper
const_nd = model_onnx.graph.node[idx-2]
mul_nd = model_onnx.graph.node[idx-1]
# Get index number which is transferred by constant node
index = int(onnx.numpy_helper.to_array(const_nd.attribute[0].t))
if index != -1:
name = index2name[index]
onnx_config[nd.name] = config[name]
nd.input[0] = mul_nd.input[0]
# Remove constant node and multiply node
model_onnx.graph.node.remove(const_nd)
model_onnx.graph.node.remove(mul_nd)
idx = idx-2
idx = idx+1
return model_onnx, onnx_config
def torch_to_onnx(model, config, input_shape, model_path, input_names, output_names):
"""
Convert torch model to onnx model and get layer bit config of onnx model.
Parameters
----------
model : pytorch model
The model to speed up by quantization
config : dict
Config recording bit number and name of layers
input_shape : tuple
The input shape of the model; passed to torch.onnx.export
model_path : str
The path where the converted ONNX model is stored
input_names : list
Input names of the ONNX model, passed to torch.onnx.export
output_names : list
Output names of the ONNX model, passed to torch.onnx.export
Returns
-------
onnx model
Onnx model which is converted from pytorch model
dict
The configuration of onnx model layers and calibration parameters
"""
# Support Gemm, Conv, Relu, Clip(Relu6) and MaxPool
support_op = [torch.nn.Conv2d, torch.nn.Linear, torch.nn.ReLU, torch.nn.ReLU6, torch.nn.MaxPool2d]
# Transfer the layer index (which keys its bit config) to the ONNX graph via the wrapper
index2name = {}
name2index = {}
if config is not None:
for i, name in enumerate(config.keys()):
index2name[i] = name
name2index[name] = i
for name, module in model.named_modules():
if config is not None and name in config:
assert type(module) in support_op
wrapper_module = LayernameModuleWrapper(module, name2index[name])
_setattr(model, name, wrapper_module)
elif type(module) in support_op:
wrapper_module = LayernameModuleWrapper(module, -1)
_setattr(model, name, wrapper_module)
# Convert torch model to onnx model and save it in model_path
dummy_input = torch.randn(input_shape)
model.to('cpu')
torch.onnx.export(model, dummy_input, model_path, verbose=False, input_names=input_names, output_names=output_names, export_params=True)
# Load onnx model
model_onnx = onnx.load(model_path)
model_onnx, onnx_config = unwrapper(model_onnx, index2name, config)
onnx.save(model_onnx, model_path)
onnx.checker.check_model(model_onnx)
return model_onnx, onnx_config
\ No newline at end of file
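# Usage sketch (hypothetical paths, mirroring ModelSpeedupTensorRT.compress() below):
#     model_onnx, onnx_config = torch_to_onnx(model, config, input_shape=(32, 1, 28, 28),
#                                             model_path="default_model.onnx",
#                                             input_names=["actual_input_1"],
#                                             output_names=["output1"])
#     # onnx_config now maps ONNX node names to the calibration entries from `config`.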
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import time
import logging
import tensorrt as trt
import numpy as np
import torch
from . import frontend_to_onnx as fonnx
from . import calibrator as calibrator
from . import trt_pycuda as common
from .backend import BaseModelSpeedup
# TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
TRT_LOGGER = trt.Logger()
logger = logging.getLogger(__name__)
class CalibrateType:
LEGACY = trt.CalibrationAlgoType.LEGACY_CALIBRATION
ENTROPY = trt.CalibrationAlgoType.ENTROPY_CALIBRATION
ENTROPY2 = trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2
MINMAX = trt.CalibrationAlgoType.MINMAX_CALIBRATION
Precision_Dict = {
8: trt.int8,
16: trt.float16,
32: trt.float32
}
def valid_config(config=None):
"""
This function validates the bit setting configuration
"""
if config is None:
return
support_bit = [8, 16, 32]
for name in config.keys():
if 'weight_bit' in config[name]:
w_bit = config[name]['weight_bit']
assert w_bit in support_bit, "weight bit should be 8, 16, 32"
if 'activation_bit' in config[name]:
a_bit = config[name]['activation_bit']
assert a_bit in support_bit, "activation bit should be 8, 16, 32"
def handle_gemm(network, layer_idx, config):
"""
This function gives Gemm layers special handling, because the number of layers a Gemm maps to changes during the PyTorch->ONNX conversion.
Parameters
----------
network : tensorrt.INetworkDefinition
Represents a TensorRT Network from which the Builder can build an Engine
layer_idx : int
layer index of gemm
config : dict
Config recording bit number and name of layers
"""
layer = network.get_layer(layer_idx)
pre_layer = network.get_layer(layer_idx-1)
next_layer = network.get_layer(layer_idx+1)
# if weight bit exists, set three layers' precision,
# input tensor range and the first two layers' output type
if 'weight_bit' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
# set three layers the same precision
layer.precision = Precision_Dict[w_bit]
pre_layer.precision = Precision_Dict[w_bit]
next_layer.precision = Precision_Dict[w_bit]
# set the first two layers' output type
pre_layer.set_output_type(0, Precision_Dict[w_bit])
layer.set_output_type(0, Precision_Dict[w_bit])
pre_in_tensor = pre_layer.get_input(0)
in_tensor = layer.get_input(0)
next_in_tensor = next_layer.get_input(0)
# set three layers' input tensor range
pre_in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
next_in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# if activation bit exists, set the last layer's output type and output tensor range
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
# set the last layer's output type
next_layer.set_output_type(0, Precision_Dict[a_bit])
next_out_tensor = next_layer.get_output(0)
# set the last layer's output tensor range
next_out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=False, calib=None):
"""
This function builds a TensorRT engine from an ONNX model, optionally running int8 calibration.
Parameters
----------
model_file : str
The path of onnx model
config : dict
Config recording bit number and name of layers
extra_layer_bit : int
Layers that are not in config will be quantized to this bit width
strict_datatype : bool
Whether to strictly constrain layer precision to the bit widths given in config. If True,
every listed layer is set to the given bit width strictly; otherwise TensorRT may choose
layer precisions automatically
calib : Calibrator
The int8 calibrator (wrapping the calibration data) used to calibrate the quantized model
Returns
-------
tensorrt.ICudaEngine
An ICudaEngine for executing inference on a built network
"""
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(common.EXPLICIT_BATCH) as network, \
trt.OnnxParser(network, TRT_LOGGER) as parser:
# Note: max_batch_size must be set to 1 because of the implementation of allocate_buffers
builder.max_batch_size = 1
builder.max_workspace_size = common.GiB(4)
if extra_layer_bit == 32 and config is None:
pass
elif extra_layer_bit == 16 and config is None:
builder.fp16_mode = True
elif extra_layer_bit == 8 and config is None:
# entire model in 8bit mode
builder.int8_mode = True
else:
builder.int8_mode = True
builder.fp16_mode = True
builder.strict_type_constraints = strict_datatype
valid_config(config)
# Parse onnx model
with open(model_file, 'rb') as model:
if not parser.parse(model.read()):
logger.error('ERROR: Fail to parse the ONNX file.')
for error in range(parser.num_errors):
logger.error(parser.get_error(error))
return None
if calib is not None:
builder.int8_calibrator = calib
# This design may be incorrect when the network has more than one output
for i in range(network.num_layers):
if config is None:
break
layer = network.get_layer(i)
if layer.name in config:
w_bit = config[layer.name]['weight_bit']
a_bit = config[layer.name]['activation_bit']
layer.precision = Precision_Dict[w_bit]
layer.set_output_type(0, Precision_Dict[a_bit])
else:
# This implementation may be incorrect when output number > 1
for i in range(network.num_layers):
if config is None:
# no low bit layer need to be set, keep original model
break
layer = network.get_layer(i)
if layer.name not in config:
continue
# the number of layers a Gemm maps to changes during the pytorch->onnx conversion, so it needs special handling
if layer.name[0:4] == "Gemm":
handle_gemm(network, i, config)
continue
# If weight_bit exists in config, set layer precision and layer's input tensor dynamic range.
if 'weight_bit' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
layer.precision = Precision_Dict[w_bit]
in_tensor = layer.get_input(0)
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# If activation exists in config, set layer output type and layer's output tensor dynamic range.
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
layer.set_output_type(0, Precision_Dict[a_bit])
out_tensor = layer.get_output(0)
out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
# Build engine and do int8 calibration.
engine = builder.build_cuda_engine(network)
return engine
class ModelSpeedupTensorRT(BaseModelSpeedup):
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bit=32, strict_datatype=True,
calibrate_type=CalibrateType.ENTROPY2, calib_data_loader=None, calibration_cache = "calibration.cache", batchsize=1,
input_names=["actual_input_1"], output_names=["output1"]):
"""
Parameters
----------
model : pytorch model
The model to speed up by quantization.
input_shape : tuple
The input shape of the model; passed to torch.onnx.export.
config : dict
Config recording bit number and name of layers.
onnx_path : str
The path where the ONNX model converted from the PyTorch model is stored.
extra_layer_bit : int
Layers that are not in config will be quantized to this bit width.
strict_datatype : bool
Whether to strictly constrain layer precision to the bit widths given in config. If True,
every listed layer is set to the given bit width strictly; otherwise TensorRT may choose
layer precisions automatically.
calibrate_type : tensorrt.tensorrt.CalibrationAlgoType
The calibration algorithm. Please refer to https://docs.nvidia.com/deeplearning/
tensorrt/api/python_api/infer/Int8/Calibrator.html for details
calib_data_loader : torch.utils.data.DataLoader or torch.Tensor
The data used to calibrate the quantized model
calibration_cache : str
The path where the calibration cache file is stored
batchsize : int
The batch size of calibration and inference
input_names : list
Input names of the ONNX model, passed to torch.onnx.export
output_names : list
Output names of the ONNX model, passed to torch.onnx.export
"""
super().__init__(model, config)
self.model = model
self.onnx_path = onnx_path
self.input_shape = input_shape
self.config = config
self.extra_layer_bit = extra_layer_bit
self.strict_datatype = strict_datatype
self.calibrate_type = calibrate_type
self.calib_data_loader = calib_data_loader
self.calibration_cache = calibration_cache
self.batchsize = batchsize
self.input_names = input_names
self.output_names = output_names
self.context = None
self.onnx_config = {}
def compress(self):
"""
Get onnx config and build tensorrt engine.
"""
assert self.model is not None
assert self.onnx_path is not None
assert self.input_shape is not None
# Convert pytorch model to onnx model and save onnx model in onnx_path
_, self.onnx_config = fonnx.torch_to_onnx(self.model, self.config, input_shape=self.input_shape,
model_path=self.onnx_path, input_names=self.input_names, output_names=self.output_names)
if self.calib_data_loader is not None:
assert self.calibrate_type is not None
context = self._tensorrt_build_withcalib(self.onnx_path)
else:
context = self._tensorrt_build_withoutcalib(self.onnx_path)
self.context = context
def _tensorrt_build_withcalib(self, onnx_path):
"""
Build the TensorRT inference engine with int8 calibration data.
Parameters
----------
onnx_path : str
The path of onnx model
Returns
-------
tensorrt.IExecutionContext
Context for executing inference using an ICudaEngine
"""
calib_data = None
if type(self.calib_data_loader) == torch.utils.data.dataloader.DataLoader:
calib_data_set = []
for data, _ in self.calib_data_loader:
calib_data_set.append(data)
calib_data = np.concatenate(calib_data_set)
elif type(self.calib_data_loader) == torch.Tensor:
calib_data = self.calib_data_loader.numpy()
else:
raise ValueError("Unsupported calibration data type")
calib = calibrator.Calibrator(calib_data, self.calibration_cache, self.batchsize, self.calibrate_type)
# build inference engine with calibration
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype, calib)
return engine.create_execution_context()
def _tensorrt_build_withoutcalib(self, onnx_path):
"""
Build inference engine without calibration
Parameters
----------
onnx_path : str
The path of onnx model
Returns
-------
tensorrt.IExecutionContext
Context for executing inference using an ICudaEngine
"""
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype)
return engine.create_execution_context()
def inference(self, test_data):
"""
Run inference with the TensorRT engine built by compress().
Parameters
----------
test_data : pytorch tensor
Model input tensor
Returns
-------
torch.Tensor
Output of the model
float
Accumulated inference time in seconds
"""
# convert pytorch tensor to numpy ndarray
test_data = test_data.numpy()
# Numpy dtype should be float32
assert test_data.dtype == np.float32
elapsed_time = 0
inputs, outputs, bindings, stream = common.allocate_buffers(self.context.engine)
result = []
for start_idx in range(0, test_data.shape[0], self.batchsize):
# If the number of images in the test set is not divisible by the batch size, the last batch will be smaller.
# This logic is used for handling that case.
end_idx = min(start_idx + self.batchsize, test_data.shape[0])
effective_batch_size = end_idx - start_idx
# Do inference for every batch.
inputs[0].host = test_data[start_idx:start_idx + effective_batch_size]
t1 = time.time()
[output] = common.do_inference_v2(self.context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
elapsed_time += time.time() - t1
shape = output.shape[0]
output = output[0:int(shape * effective_batch_size / self.batchsize)].reshape(effective_batch_size, -1)
result.append(output.copy())
# convert the accumulated numpy ndarrays back to a single pytorch tensor
result = torch.Tensor(np.concatenate(result))
return result, elapsed_time
def export_quantized_model(self, path):
"""
Export the serialized TensorRT engine; it can only be loaded back through the TensorRT deserialization API (see load_quantized_model).
Parameters
----------
path : str
The path of export model
"""
assert path is not None
with open(path, "wb") as f:
f.write(self.context.engine.serialize())
logger.info("TensorRT engine has been saved to %s", path)
def load_quantized_model(self, path):
"""
Load TensorRT quantized model engine from specific path.
Parameters
----------
path : str
The path of export model
"""
assert path is not None
with open(path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
engine = runtime.deserialize_cuda_engine(f.read())
self.context = engine.create_execution_context()
logger.info("Load TensorRT engine from %s successfully.", path)
\ No newline at end of file
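# Export/reload sketch (hypothetical file and tensor names): after compress(), the built
# engine can be serialized and reused later without rebuilding:
#     engine.export_quantized_model("mnist_trt.engine")
#     engine2 = ModelSpeedupTensorRT(model, input_shape=(32, 1, 28, 28), batchsize=32)
#     engine2.load_quantized_model("mnist_trt.engine")
#     output, latency = engine2.inference(test_tensor)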
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import pycuda.driver as cuda
import pycuda.autoinit # pylint: disable=unused-import
import tensorrt as trt
EXPLICIT_BATCH = 1
def GiB(val):
return val * 1 << 30
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
"""
Simple wrapper pairing a host memory buffer with its corresponding device buffer.
Parameters
----------
host_mem : host memory
Memory buffers of host
device_mem : device memory
Memory buffers of device
"""
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
def allocate_buffers(engine):
"""
Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
Parameters
----------
engine : tensorrt.ICudaEngine
An ICudaEngine for executing inference on a built network
Returns
-------
list
All input HostDeviceMem of an engine
list
All output HostDeviceMem of an engine
GPU bindings
Device bindings
GPU stream
A stream is a sequence of commands (possibly issued by different host threads) that execute in order
"""
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
\ No newline at end of file
......@@ -45,6 +45,6 @@ enable= unused-wildcard-import,
ignore-patterns=test*
# List of members which are set dynamically and missed by pylint inference
generated-members=numpy.*,torch.*,tensorflow.*
generated-members=numpy.*,torch.*,tensorflow.*,pycuda.*,tensorrt.*
ignored-modules=tensorflow,_winapi,msvcrt
ignored-modules=tensorflow,_winapi,msvcrt,tensorrt,pycuda
......@@ -239,15 +239,16 @@ class CompressorTestCase(TestCase):
# test quantize
# range not including 0
eps = 1e-7
input = torch.tensor([[0, 4], [2, 1]]).float()
weight = torch.tensor([[1, 2], [3, 5]]).float()
model.conv2.module.old_weight.data = weight
quantizer.quantize_weight(model.conv2)
quantizer.quantize_weight(model.conv2, input_tensor=input)
assert math.isclose(model.conv2.module.scale, 5 / 255, abs_tol=eps)
assert model.conv2.module.zero_point == 0
# range including 0
weight = torch.tensor([[-1, 2], [3, 5]]).float()
model.conv2.module.old_weight.data = weight
quantizer.quantize_weight(model.conv2)
quantizer.quantize_weight(model.conv2, input_tensor=input)
assert math.isclose(model.conv2.module.scale, 6 / 255, abs_tol=eps)
assert model.conv2.module.zero_point in (42, 43)
# test value of weight and bias after quantization
......@@ -257,7 +258,7 @@ class CompressorTestCase(TestCase):
bias_valid = torch.tensor([2.3432, 3.4342, 1.3414, 5.2341])
model.conv2.module.old_weight.data = weight
model.conv2.module.bias.data = bias
quantizer.quantize_weight(model.conv2)
quantizer.quantize_weight(model.conv2, input_tensor=input)
assert torch.all(torch.isclose(model.conv2.module.weight.data, weight_valid, rtol=1e-4))
assert torch.all(torch.isclose(model.conv2.module.bias.data, bias_valid, rtol=1e-7))
......@@ -265,14 +266,14 @@ class CompressorTestCase(TestCase):
eps = 1e-7
x = torch.tensor([[-0.2, 0], [0.1, 0.2]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_biased, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_biased, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_activation, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.002, abs_tol=eps)
quantizer.step_with_optimizer()
x = torch.tensor([[0.2, 0.4], [0.6, 0.8]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_biased, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_biased, 0.00998, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_activation, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.00998, abs_tol=eps)
def test_torch_quantizer_export(self):
config_list_qat = [{
......