"megatron/git@developer.sourcefind.cn:wuxk1/megatron-lm.git" did not exist on "2de7ae27ed81c3f35ae3d7bd34acf26fa2fca93e"
Unverified commit 86335921, authored by lin bin, committed by GitHub

[Model Compression Quantization] Unify variable name (#3990)

parent e5c3ac63
......@@ -58,10 +58,10 @@ def post_training_quantization_example(train_loader, test_loader, device):
model = NaiveModel()
config = {
'conv1':{'weight_bit':8, 'activation_bit':8},
'conv2':{'weight_bit':32, 'activation_bit':32},
'fc1':{'weight_bit':16, 'activation_bit':16},
'fc2':{'weight_bit':8, 'activation_bit':8}
'conv1':{'weight_bits':8, 'output_bits':8},
'conv2':{'weight_bits':32, 'output_bits':32},
'fc1':{'weight_bits':16, 'output_bits':16},
'fc2':{'weight_bits':8, 'output_bits':8}
}
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
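For quick reference, the renaming this commit applies across the quantization config dicts, plus a hypothetical helper for migrating an old-style config (the key names are taken from the diff; the helper itself is not part of the commit):

# Old per-layer config keys -> new unified keys.
KEY_RENAMES = {
    'weight_bit': 'weight_bits',
    'activation_bit': 'output_bits',
    'tracked_min_activation': 'tracked_min_output',
    'tracked_max_activation': 'tracked_max_output',
}

def migrate_config(config):
    # Rewrite {'conv1': {'weight_bit': 8, ...}, ...} to the new key names.
    return {layer: {KEY_RENAMES.get(key, key): value for key, value in settings.items()}
            for layer, settings in config.items()}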
......@@ -102,8 +102,10 @@ def quantization_aware_training_example(train_loader, test_loader, device):
]
# finetune the model by using QAT
# enable batchnorm folding mode
dummy_input = torch.randn(1, 1, 28, 28)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer)
quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
quantizer.compress()
model.to(device)
......
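The new dummy_input argument is what drives the batchnorm-folding mode mentioned in the comment above: the quantizer uses it to trace the model so conv/batchnorm pairs can be folded before the fake-quantization ops are inserted. A minimal sketch of the changed call (the import path is the usual NNI one and is stated here as an assumption):

import torch
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer  # assumed import path

dummy_input = torch.randn(1, 1, 28, 28)  # traced once to locate conv/bn pairs for folding
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
quantizer.compress()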
......@@ -834,7 +834,7 @@ class QuantGrad(torch.autograd.Function):
@classmethod
def get_bits_length(cls, config, quant_type):
"""
Get bit for quantize config
Get bits for quantize config
Parameters
----------
config : Dict
......
......@@ -9,26 +9,26 @@ The main function of this page is to convert pytorch model to onnx model.
Convertion from pytorch model to onnx model is primary so that a critical
problem is caused that Layer name of pytorch model fail to convert to onnx
layer name directly. To solve it, we wrap pytorch model in new wrapper which
multiply bit number and input before computation of each op. Only in this
way can onnx model get bit number of corresponded layer.
multiply bits number and input before computation of each op. Only in this
way can onnx model get bits number of corresponded layer.
"""
class LayernameModuleWrapper(torch.nn.Module):
def __init__(self, module, module_bit) -> None:
def __init__(self, module, module_bits) -> None:
"""
Parameters
----------
module : torch.nn.Module
Layer module of pytorch model
module_bit : int
Bit width setting for module
module_bits : int
Bits width setting for module
"""
super().__init__()
self.module = module
self.module_bit = module_bit
self.module_bits = module_bits
def forward(self, inputs):
inputs = inputs*self.module_bit
inputs = inputs*self.module_bits
inputs = self.module(inputs)
return inputs
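To make the docstring's trick concrete, a small sketch (layer, shapes and file name are illustrative) of wrapping a single layer before export; the extra multiply becomes a constant Mul node in front of the layer's ONNX op, which is how the bits setting survives the conversion until unwrapper strips it out again:

import torch

conv = torch.nn.Conv2d(1, 8, kernel_size=3)
wrapped = LayernameModuleWrapper(conv, module_bits=8)

# The input is scaled by module_bits in forward(), so the exported graph carries
# the bit width as a constant that can be matched back to this layer.
torch.onnx.export(wrapped, torch.randn(1, 1, 28, 28), "wrapped_layer.onnx")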
......@@ -93,14 +93,14 @@ def unwrapper(model_onnx, index2name, config):
def torch_to_onnx(model, config, input_shape, model_path, input_names, output_names):
"""
Convert torch model to onnx model and get layer bit config of onnx model.
Convert torch model to onnx model and get layer bits config of onnx model.
Parameters
----------
model : pytorch model
The model to speed up by quantization
config : dict
Config recording bit number and name of layers
Config recording bits number and name of layers
input_shape : tuple
The input shape of model, shall pass it to torch.onnx.export
model_path : str
......@@ -119,7 +119,7 @@ def torch_to_onnx(model, config, input_shape, model_path, input_names, output_na
"""
# Support Gemm, Conv, Relu, Clip(Relu6) and MaxPool
support_op = [torch.nn.Conv2d, torch.nn.Linear, torch.nn.ReLU, torch.nn.ReLU6, torch.nn.MaxPool2d]
# Transfer bit number to onnx layer by using wrapper
# Transfer bits number to onnx layer by using wrapper
index2name = {}
name2index = {}
if config is not None:
......
......@@ -31,18 +31,18 @@ Precision_Dict = {
def valid_config(config=None):
"""
This function validates the bit setting configuration
This function validates the bits setting configuration
"""
if config is None:
return
support_bit = [8, 16, 32]
support_bits = [8, 16, 32]
for name in config.keys():
if 'weight_bit' in config[name]:
w_bit = config[name]['weight_bit']
assert w_bit in support_bit, "weight bit should be 8, 16, 32"
if 'activation_bit' in config[name]:
a_bit = config[name]['activation_bit']
assert a_bit in support_bit, "activation bit should be 8, 16, 32"
if 'weight_bits' in config[name]:
w_bits = config[name]['weight_bits']
assert w_bits in support_bits, "weight bits should be 8, 16, 32"
if 'output_bits' in config[name]:
a_bits = config[name]['output_bits']
assert a_bits in support_bits, "output bits should be 8, 16, 32"
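A quick sanity check with the renamed keys (layer names follow the example at the top of this commit; anything outside the supported widths trips the assertions above):

valid_config({
    'conv1': {'weight_bits': 8, 'output_bits': 8},
    'fc1': {'weight_bits': 16, 'output_bits': 16},
})
# valid_config({'conv1': {'weight_bits': 4}}) would fail with "weight bits should be 8, 16, 32".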
def handle_gemm(network, layer_idx, config):
"""
......@@ -55,26 +55,26 @@ def handle_gemm(network, layer_idx, config):
layer_idx : int
layer index of gemm
config : dict
Config recording bit number and name of layers
Config recording bits number and name of layers
"""
layer = network.get_layer(layer_idx)
pre_layer = network.get_layer(layer_idx-1)
next_layer = network.get_layer(layer_idx+1)
# if weight bit exists, set three layers' precision,
# if weight bits exists, set three layers' precision,
# input tensor range and the first two layers' output type
if 'weight_bit' in config[layer.name]:
if 'weight_bits' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
w_bits = config[layer.name]['weight_bits']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
# set three layers the same precision
layer.precision = Precision_Dict[w_bit]
pre_layer.precision = Precision_Dict[w_bit]
next_layer.precision = Precision_Dict[w_bit]
layer.precision = Precision_Dict[w_bits]
pre_layer.precision = Precision_Dict[w_bits]
next_layer.precision = Precision_Dict[w_bits]
# set the first two layers' output type
pre_layer.set_output_type(0, Precision_Dict[w_bit])
layer.set_output_type(0, Precision_Dict[w_bit])
pre_layer.set_output_type(0, Precision_Dict[w_bits])
layer.set_output_type(0, Precision_Dict[w_bits])
pre_in_tensor = pre_layer.get_input(0)
in_tensor = layer.get_input(0)
next_in_tensor = next_layer.get_input(0)
......@@ -83,20 +83,20 @@ def handle_gemm(network, layer_idx, config):
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
next_in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# if activation bit exists, set the last layer's output type output tensor range
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
# if output bits exists, set the last layer's output type output tensor range
if 'output_bits' in config[layer.name]:
assert 'tracked_min_output' in config[layer.name]
assert 'tracked_max_output' in config[layer.name]
a_bits = config[layer.name]['output_bits']
tracked_min_output = config[layer.name]['tracked_min_output']
tracked_max_output = config[layer.name]['tracked_max_output']
# set the last layer's output type
next_layer.set_output_type(0, Precision_Dict[a_bit])
next_layer.set_output_type(0, Precision_Dict[a_bits])
next_out_tensor = next_layer.get_output(0)
# set the last layer's output tensor range
next_out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
next_out_tensor.dynamic_range = (tracked_min_output, tracked_max_output)
def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=False, calib=None):
def build_engine(model_file, config=None, extra_layer_bits=32, strict_datatype=False, calib=None):
"""
This function builds an engine from an onnx model with calibration process.
......@@ -105,12 +105,12 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
model_file : str
The path of onnx model
config : dict
Config recording bit number and name of layers
extra_layer_bit : int
Other layers which are not in config will be quantized to corresponding bit number
Config recording bits number and name of layers
extra_layer_bits : int
Other layers which are not in config will be quantized to corresponding bits number
strict_datatype : bool
Whether constrain layer bit to the number given in config or not. If true, all the layer
will be set to given bit strictly. Otherwise, these layers will be set automatically by
Whether constrain layer bits to the number given in config or not. If true, all the layer
will be set to given bits strictly. Otherwise, these layers will be set automatically by
tensorrt
calib : numpy array
The data using to calibrate quantization model
......@@ -135,14 +135,14 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
else:
builder.max_workspace_size = common.GiB(4)
if extra_layer_bit == 32 and config is None:
if extra_layer_bits == 32 and config is None:
pass
elif extra_layer_bit == 16 and config is None:
elif extra_layer_bits == 16 and config is None:
if trt_version == TRT8:
trt_config.set_flag(trt.BuilderFlag.FP16)
else:
builder.fp16_mode = True
elif extra_layer_bit == 8 and config is None:
elif extra_layer_bits == 8 and config is None:
# entire model in 8bit mode
if trt_version == TRT8:
trt_config.set_flag(trt.BuilderFlag.INT8)
......@@ -180,15 +180,15 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
break
layer = network.get_layer(i)
if layer.name in config:
w_bit = config[layer.name]['weight_bit']
a_bit = config[layer.name]['activation_bit']
layer.precision = Precision_Dict[w_bit]
layer.set_output_type(0, Precision_Dict[a_bit])
w_bits = config[layer.name]['weight_bits']
a_bits = config[layer.name]['output_bits']
layer.precision = Precision_Dict[w_bits]
layer.set_output_type(0, Precision_Dict[a_bits])
else:
# This implementation may be incorrect when output number > 1
for i in range(network.num_layers):
if config is None:
# no low bit layer need to be set, keep original model
# no low bits layer need to be set, keep original model
break
layer = network.get_layer(i)
if layer.name not in config:
......@@ -198,37 +198,37 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
handle_gemm(network, i, config)
continue
# If weight_bit exists in config, set layer precision and layer's input tensor dynamic range.
if 'weight_bit' in config[layer.name]:
# If weight_bits exists in config, set layer precision and layer's input tensor dynamic range.
if 'weight_bits' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
w_bits = config[layer.name]['weight_bits']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
layer.precision = Precision_Dict[w_bit]
layer.precision = Precision_Dict[w_bits]
in_tensor = layer.get_input(0)
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# If activation exists in config, set layer output type and layer's output tensor dynamic range.
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
layer.set_output_type(0, Precision_Dict[a_bit])
# If output exists in config, set layer output type and layer's output tensor dynamic range.
if 'output_bits' in config[layer.name]:
assert 'tracked_min_output' in config[layer.name]
assert 'tracked_max_output' in config[layer.name]
a_bits = config[layer.name]['output_bits']
tracked_min_output = config[layer.name]['tracked_min_output']
tracked_max_output = config[layer.name]['tracked_max_output']
layer.set_output_type(0, Precision_Dict[a_bits])
out_tensor = layer.get_output(0)
out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
out_tensor.dynamic_range = (tracked_min_output, tracked_max_output)
# Build engine and do int8 calibration.
if trt_version == TRT8:
engine = builder.build_engine(network, trt_config)
else:
engine.builder.build_cuda_engine(network)
engine = builder.build_cuda_engine(network)
return engine
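A hedged usage sketch of build_engine with the renamed extra_layer_bits argument (paths and the layer_config variable are illustrative; the config=None branches follow the builder flags set above):

# Whole model in FP16: no per-layer config, extra_layer_bits=16.
engine_fp16 = build_engine("default_model.onnx", config=None, extra_layer_bits=16)

# Mixed precision: per-layer 'weight_bits'/'output_bits' config produced by the
# quantizer; layers not listed in the config stay at the default 32 bits.
engine_mixed = build_engine("default_model.onnx", layer_config, extra_layer_bits=32)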
class ModelSpeedupTensorRT(BaseModelSpeedup):
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bit=32, strict_datatype=True,
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bits=32, strict_datatype=True,
calibrate_type=CalibrateType.ENTROPY2, calib_data_loader=None, calibration_cache = "calibration.cache", batchsize=1,
input_names=["actual_input_1"], output_names=["output1"]):
"""
......@@ -239,14 +239,14 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
input_shape : tuple
The input shape of model, shall pass it to torch.onnx.export.
config : dict
Config recording bit number and name of layers.
Config recording bits number and name of layers.
onnx_path : str
The path user want to store onnx model which is converted from pytorch model.
extra_layer_bit : int
Other layers which are not in config will be quantized to corresponding bit number.
extra_layer_bits : int
Other layers which are not in config will be quantized to corresponding bits number.
strict_datatype : bool
Whether constrain layer bit to the number given in config or not. If true, all the layer
will be set to given bit strictly. Otherwise, these layers will be set automatically by
Whether constrain layer bits to the number given in config or not. If true, all the layer
will be set to given bits strictly. Otherwise, these layers will be set automatically by
tensorrt.
calibrate_type : tensorrt.tensorrt.CalibrationAlgoType
The algorithm of calibrating. Please refer to https://docs.nvidia.com/deeplearning/
......@@ -267,7 +267,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
self.onnx_path = onnx_path
self.input_shape = input_shape
self.config = config
self.extra_layer_bit = extra_layer_bit
self.extra_layer_bits = extra_layer_bits
self.strict_datatype = strict_datatype
self.calibrate_type = calibrate_type
self.calib_data_loader = calib_data_loader
......@@ -327,7 +327,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
calib = calibrator.Calibrator(calib_data, self.calibration_cache, self.batchsize, self.calibrate_type)
# build inference engine with calibration
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype, calib)
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bits, self.strict_datatype, calib)
return engine.create_execution_context()
def _tensorrt_build_withoutcalib(self, onnx_path):
......@@ -344,7 +344,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
tensorrt.IExecutionContext
Context for executing inference using an ICudaEngine
"""
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype)
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bits, self.strict_datatype)
return engine.create_execution_context()
def inference(self, test_data):
......
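Putting it together, a minimal sketch of constructing the speed-up wrapper with the renamed parameter (shapes, batch size and the calibration_config variable are illustrative; the engine build step that calls the _tensorrt_build_* helpers above is outside this hunk):

rt = ModelSpeedupTensorRT(
    model,
    input_shape=(1, 1, 28, 28),
    config=calibration_config,   # per-layer 'weight_bits'/'output_bits' plus tracked input/output ranges
    extra_layer_bits=32,
    strict_datatype=True,
    batchsize=32,
)
# After the engine is built, predictions come from rt.inference(test_data).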
......@@ -49,7 +49,8 @@ class CompressorTestCase(TestCase):
}]
model.relu = torch.nn.ReLU()
quantizer = torch_quantizer.QAT_Quantizer(model, config_list)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = torch_quantizer.QAT_Quantizer(model, config_list, optimizer)
quantizer.compress()
modules_to_compress = quantizer.get_modules_to_compress()
modules_to_compress_name = [t[0].name for t in modules_to_compress]
......@@ -317,7 +318,9 @@ class CompressorTestCase(TestCase):
'op_types': ['ReLU']
}]
model.relu = torch.nn.ReLU()
quantizer = torch_quantizer.QAT_Quantizer(model, config_list)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = torch_quantizer.QAT_Quantizer(model, config_list, optimizer)
quantizer.compress()
# test quantize
......@@ -350,14 +353,14 @@ class CompressorTestCase(TestCase):
eps = 1e-7
x = torch.tensor([[-0.2, 0], [0.1, 0.2]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_activation, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_output, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_output, 0.002, abs_tol=eps)
quantizer.step_with_optimizer()
x = torch.tensor([[0.2, 0.4], [0.6, 0.8]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_activation, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.00998, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_output, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_output, 0.00998, abs_tol=eps)
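For context on the expected values, a worked check of the moving-average tracking these assertions rely on (the 0.01/0.99 EMA factor is inferred from the numbers, so treat it as an assumption about the quantizer's internals):

ema = 0.01
tracked_min, tracked_max = 0.0, 0.0
# step 1: relu([[-0.2, 0], [0.1, 0.2]]) has min 0.0 and max 0.2
tracked_min = (1 - ema) * tracked_min + ema * 0.0   # -> 0.0
tracked_max = (1 - ema) * tracked_max + ema * 0.2   # -> 0.002
# step 2: relu([[0.2, 0.4], [0.6, 0.8]]) has min 0.2 and max 0.8
tracked_min = (1 - ema) * tracked_min + ema * 0.2   # -> 0.002
tracked_max = (1 - ema) * tracked_max + ema * 0.8   # -> 0.00998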
def test_torch_quantizer_export(self):
config_list_qat = [{
......@@ -392,7 +395,8 @@ class CompressorTestCase(TestCase):
for config, quantize_algorithm in zip(config_set, quantize_algorithm_set):
model = TorchModel()
model.relu = torch.nn.ReLU()
quantizer = quantize_algorithm(model, config)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = quantize_algorithm(model, config, optimizer)
quantizer.compress()
x = torch.rand((1, 1, 28, 28), requires_grad=True)
......