"megatron/git@developer.sourcefind.cn:wuxk1/megatron-lm.git" did not exist on "2de7ae27ed81c3f35ae3d7bd34acf26fa2fca93e"
Unverified commit 86335921, authored by lin bin, committed by GitHub

[Model Compression Quantization] Unify variable name (#3990)

parent e5c3ac63
......@@ -58,10 +58,10 @@ def post_training_quantization_example(train_loader, test_loader, device):
model = NaiveModel()
config = {
'conv1':{'weight_bit':8, 'activation_bit':8},
'conv2':{'weight_bit':32, 'activation_bit':32},
'fc1':{'weight_bit':16, 'activation_bit':16},
'fc2':{'weight_bit':8, 'activation_bit':8}
'conv1':{'weight_bits':8, 'output_bits':8},
'conv2':{'weight_bits':32, 'output_bits':32},
'fc1':{'weight_bits':16, 'output_bits':16},
'fc2':{'weight_bits':8, 'output_bits':8}
}
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
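For quick reference, the renaming this commit applies across the quantization config dicts, plus a hypothetical helper for migrating an old-style config (the key names are taken from the diff; the helper itself is not part of the commit):

# Old per-layer config keys -> new unified keys.
KEY_RENAMES = {
    'weight_bit': 'weight_bits',
    'activation_bit': 'output_bits',
    'tracked_min_activation': 'tracked_min_output',
    'tracked_max_activation': 'tracked_max_output',
}

def migrate_config(config):
    # Rewrite {'conv1': {'weight_bit': 8, ...}, ...} to the new key names.
    return {layer: {KEY_RENAMES.get(key, key): value for key, value in settings.items()}
            for layer, settings in config.items()}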
......@@ -102,8 +102,10 @@ def quantization_aware_training_example(train_loader, test_loader, device):
]
# finetune the model by using QAT
# enable batchnorm folding mode
dummy_input = torch.randn(1, 1, 28, 28)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer)
quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
quantizer.compress()
model.to(device)
......
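The new dummy_input argument is what drives the batchnorm-folding mode mentioned in the comment above: the quantizer uses it to trace the model so conv/batchnorm pairs can be folded before the fake-quantization ops are inserted. A minimal sketch of the changed call (the import path is the usual NNI one and is stated here as an assumption):

import torch
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer  # assumed import path

dummy_input = torch.randn(1, 1, 28, 28)  # traced once to locate conv/bn pairs for folding
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
quantizer.compress()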
......@@ -834,7 +834,7 @@ class QuantGrad(torch.autograd.Function):
@classmethod
def get_bits_length(cls, config, quant_type):
"""
Get bit for quantize config
Get bits for quantize config
Parameters
----------
config : Dict
......
......@@ -9,26 +9,26 @@ The main function of this page is to convert pytorch model to onnx model.
Convertion from pytorch model to onnx model is primary so that a critical
problem is caused that Layer name of pytorch model fail to convert to onnx
layer name directly. To solve it, we wrap pytorch model in new wrapper which
multiply bit number and input before computation of each op. Only in this
way can onnx model get bit number of corresponded layer.
multiply bits number and input before computation of each op. Only in this
way can onnx model get bits number of corresponded layer.
"""
class LayernameModuleWrapper(torch.nn.Module):
def __init__(self, module, module_bit) -> None:
def __init__(self, module, module_bits) -> None:
"""
Parameters
----------
module : torch.nn.Module
Layer module of pytorch model
module_bit : int
Bit width setting for module
module_bits : int
Bits width setting for module
"""
super().__init__()
self.module = module
self.module_bit = module_bit
self.module_bits = module_bits
def forward(self, inputs):
inputs = inputs*self.module_bit
inputs = inputs*self.module_bits
inputs = self.module(inputs)
return inputs
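To make the docstring's trick concrete, a small sketch (layer, shapes and file name are illustrative) of wrapping a single layer before export; the extra multiply becomes a constant Mul node in front of the layer's ONNX op, which is how the bits setting survives the conversion until unwrapper strips it out again:

import torch

conv = torch.nn.Conv2d(1, 8, kernel_size=3)
wrapped = LayernameModuleWrapper(conv, module_bits=8)

# The input is scaled by module_bits in forward(), so the exported graph carries
# the bit width as a constant that can be matched back to this layer.
torch.onnx.export(wrapped, torch.randn(1, 1, 28, 28), "wrapped_layer.onnx")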
......@@ -93,14 +93,14 @@ def unwrapper(model_onnx, index2name, config):
def torch_to_onnx(model, config, input_shape, model_path, input_names, output_names):
"""
Convert torch model to onnx model and get layer bit config of onnx model.
Convert torch model to onnx model and get layer bits config of onnx model.
Parameters
----------
model : pytorch model
The model to speed up by quantization
config : dict
Config recording bit number and name of layers
Config recording bits number and name of layers
input_shape : tuple
The input shape of model, shall pass it to torch.onnx.export
model_path : str
......@@ -119,7 +119,7 @@ def torch_to_onnx(model, config, input_shape, model_path, input_names, output_na
"""
# Support Gemm, Conv, Relu, Clip(Relu6) and MaxPool
support_op = [torch.nn.Conv2d, torch.nn.Linear, torch.nn.ReLU, torch.nn.ReLU6, torch.nn.MaxPool2d]
# Transfer bit number to onnx layer by using wrapper
# Transfer bits number to onnx layer by using wrapper
index2name = {}
name2index = {}
if config is not None:
......
......@@ -31,18 +31,18 @@ Precision_Dict = {
def valid_config(config=None):
"""
This function validates the bit setting configuration
This function validates the bits setting configuration
"""
if config is None:
return
support_bit = [8, 16, 32]
support_bits = [8, 16, 32]
for name in config.keys():
if 'weight_bit' in config[name]:
w_bit = config[name]['weight_bit']
assert w_bit in support_bit, "weight bit should be 8, 16, 32"
if 'activation_bit' in config[name]:
a_bit = config[name]['activation_bit']
assert a_bit in support_bit, "activation bit should be 8, 16, 32"
if 'weight_bits' in config[name]:
w_bits = config[name]['weight_bits']
assert w_bits in support_bits, "weight bits should be 8, 16, 32"
if 'output_bits' in config[name]:
a_bits = config[name]['output_bits']
assert a_bits in support_bits, "output bits should be 8, 16, 32"
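A quick sanity check with the renamed keys (layer names follow the example at the top of this commit; anything outside the supported widths trips the assertions above):

valid_config({
    'conv1': {'weight_bits': 8, 'output_bits': 8},
    'fc1': {'weight_bits': 16, 'output_bits': 16},
})
# valid_config({'conv1': {'weight_bits': 4}}) would fail with "weight bits should be 8, 16, 32".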
def handle_gemm(network, layer_idx, config):
"""
......@@ -55,26 +55,26 @@ def handle_gemm(network, layer_idx, config):
layer_idx : int
layer index of gemm
config : dict
Config recording bit number and name of layers
Config recording bits number and name of layers
"""
layer = network.get_layer(layer_idx)
pre_layer = network.get_layer(layer_idx-1)
next_layer = network.get_layer(layer_idx+1)
# if weight bit exists, set three layers' precision,
# if weight bits exists, set three layers' precision,
# input tensor range and the first two layers' output type
if 'weight_bit' in config[layer.name]:
if 'weight_bits' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
w_bits = config[layer.name]['weight_bits']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
# set three layers the same precision
layer.precision = Precision_Dict[w_bit]
pre_layer.precision = Precision_Dict[w_bit]
next_layer.precision = Precision_Dict[w_bit]
layer.precision = Precision_Dict[w_bits]
pre_layer.precision = Precision_Dict[w_bits]
next_layer.precision = Precision_Dict[w_bits]
# set the first two layers' output type
pre_layer.set_output_type(0, Precision_Dict[w_bit])
layer.set_output_type(0, Precision_Dict[w_bit])
pre_layer.set_output_type(0, Precision_Dict[w_bits])
layer.set_output_type(0, Precision_Dict[w_bits])
pre_in_tensor = pre_layer.get_input(0)
in_tensor = layer.get_input(0)
next_in_tensor = next_layer.get_input(0)
......@@ -83,20 +83,20 @@ def handle_gemm(network, layer_idx, config):
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
next_in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# if activation bit exists, set the last layer's output type output tensor range
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
# if output bits exists, set the last layer's output type output tensor range
if 'output_bits' in config[layer.name]:
assert 'tracked_min_output' in config[layer.name]
assert 'tracked_max_output' in config[layer.name]
a_bits = config[layer.name]['output_bits']
tracked_min_output = config[layer.name]['tracked_min_output']
tracked_max_output = config[layer.name]['tracked_max_output']
# set the last layer's output type
next_layer.set_output_type(0, Precision_Dict[a_bit])
next_layer.set_output_type(0, Precision_Dict[a_bits])
next_out_tensor = next_layer.get_output(0)
# set the last layer's output tensor range
next_out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
next_out_tensor.dynamic_range = (tracked_min_output, tracked_max_output)
def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=False, calib=None):
def build_engine(model_file, config=None, extra_layer_bits=32, strict_datatype=False, calib=None):
"""
This function builds an engine from an onnx model with calibration process.
......@@ -105,12 +105,12 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
model_file : str
The path of onnx model
config : dict
Config recording bit number and name of layers
extra_layer_bit : int
Other layers which are not in config will be quantized to corresponding bit number
Config recording bits number and name of layers
extra_layer_bits : int
Other layers which are not in config will be quantized to corresponding bits number
strict_datatype : bool
Whether constrain layer bit to the number given in config or not. If true, all the layer
will be set to given bit strictly. Otherwise, these layers will be set automatically by
Whether constrain layer bits to the number given in config or not. If true, all the layer
will be set to given bits strictly. Otherwise, these layers will be set automatically by
tensorrt
calib : numpy array
The data using to calibrate quantization model
......@@ -135,14 +135,14 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
else:
builder.max_workspace_size = common.GiB(4)
if extra_layer_bit == 32 and config is None:
if extra_layer_bits == 32 and config is None:
pass
elif extra_layer_bit == 16 and config is None:
elif extra_layer_bits == 16 and config is None:
if trt_version == TRT8:
trt_config.set_flag(trt.BuilderFlag.FP16)
else:
builder.fp16_mode = True
elif extra_layer_bit == 8 and config is None:
elif extra_layer_bits == 8 and config is None:
# entire model in 8bit mode
if trt_version == TRT8:
trt_config.set_flag(trt.BuilderFlag.INT8)
......@@ -180,15 +180,15 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
break
layer = network.get_layer(i)
if layer.name in config:
w_bit = config[layer.name]['weight_bit']
a_bit = config[layer.name]['activation_bit']
layer.precision = Precision_Dict[w_bit]
layer.set_output_type(0, Precision_Dict[a_bit])
w_bits = config[layer.name]['weight_bits']
a_bits = config[layer.name]['output_bits']
layer.precision = Precision_Dict[w_bits]
layer.set_output_type(0, Precision_Dict[a_bits])
else:
# This implementation may be incorrect when output number > 1
for i in range(network.num_layers):
if config is None:
# no low bit layer need to be set, keep original model
# no low bits layer need to be set, keep original model
break
layer = network.get_layer(i)
if layer.name not in config:
......@@ -198,37 +198,37 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
handle_gemm(network, i, config)
continue
# If weight_bit exists in config, set layer precision and layer's input tensor dynamic range.
if 'weight_bit' in config[layer.name]:
# If weight_bits exists in config, set layer precision and layer's input tensor dynamic range.
if 'weight_bits' in config[layer.name]:
assert 'tracked_min_input' in config[layer.name]
assert 'tracked_max_input' in config[layer.name]
w_bit = config[layer.name]['weight_bit']
w_bits = config[layer.name]['weight_bits']
tracked_min_input = config[layer.name]['tracked_min_input']
tracked_max_input = config[layer.name]['tracked_max_input']
layer.precision = Precision_Dict[w_bit]
layer.precision = Precision_Dict[w_bits]
in_tensor = layer.get_input(0)
in_tensor.dynamic_range = (tracked_min_input, tracked_max_input)
# If activation exists in config, set layer output type and layer's output tensor dynamic range.
if 'activation_bit' in config[layer.name]:
assert 'tracked_min_activation' in config[layer.name]
assert 'tracked_max_activation' in config[layer.name]
a_bit = config[layer.name]['activation_bit']
tracked_min_activation = config[layer.name]['tracked_min_activation']
tracked_max_activation = config[layer.name]['tracked_max_activation']
layer.set_output_type(0, Precision_Dict[a_bit])
# If output exists in config, set layer output type and layer's output tensor dynamic range.
if 'output_bits' in config[layer.name]:
assert 'tracked_min_output' in config[layer.name]
assert 'tracked_max_output' in config[layer.name]
a_bits = config[layer.name]['output_bits']
tracked_min_output = config[layer.name]['tracked_min_output']
tracked_max_output = config[layer.name]['tracked_max_output']
layer.set_output_type(0, Precision_Dict[a_bits])
out_tensor = layer.get_output(0)
out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
out_tensor.dynamic_range = (tracked_min_output, tracked_max_output)
# Build engine and do int8 calibration.
if trt_version == TRT8:
engine = builder.build_engine(network, trt_config)
else:
engine.builder.build_cuda_engine(network)
engine = builder.build_cuda_engine(network)
return engine
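A hedged usage sketch of build_engine with the renamed extra_layer_bits argument (paths and the layer_config variable are illustrative; the config=None branches follow the builder flags set above):

# Whole model in FP16: no per-layer config, extra_layer_bits=16.
engine_fp16 = build_engine("default_model.onnx", config=None, extra_layer_bits=16)

# Mixed precision: per-layer 'weight_bits'/'output_bits' config produced by the
# quantizer; layers not listed in the config stay at the default 32 bits.
engine_mixed = build_engine("default_model.onnx", layer_config, extra_layer_bits=32)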
class ModelSpeedupTensorRT(BaseModelSpeedup):
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bit=32, strict_datatype=True,
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bits=32, strict_datatype=True,
calibrate_type=CalibrateType.ENTROPY2, calib_data_loader=None, calibration_cache = "calibration.cache", batchsize=1,
input_names=["actual_input_1"], output_names=["output1"]):
"""
......@@ -239,14 +239,14 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
input_shape : tuple
The input shape of model, shall pass it to torch.onnx.export.
config : dict
Config recording bit number and name of layers.
Config recording bits number and name of layers.
onnx_path : str
The path user want to store onnx model which is converted from pytorch model.
extra_layer_bit : int
Other layers which are not in config will be quantized to corresponding bit number.
extra_layer_bits : int
Other layers which are not in config will be quantized to corresponding bits number.
strict_datatype : bool
Whether constrain layer bit to the number given in config or not. If true, all the layer
will be set to given bit strictly. Otherwise, these layers will be set automatically by
Whether constrain layer bits to the number given in config or not. If true, all the layer
will be set to given bits strictly. Otherwise, these layers will be set automatically by
tensorrt.
calibrate_type : tensorrt.tensorrt.CalibrationAlgoType
The algorithm of calibrating. Please refer to https://docs.nvidia.com/deeplearning/
......@@ -267,7 +267,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
self.onnx_path = onnx_path
self.input_shape = input_shape
self.config = config
self.extra_layer_bit = extra_layer_bit
self.extra_layer_bits = extra_layer_bits
self.strict_datatype = strict_datatype
self.calibrate_type = calibrate_type
self.calib_data_loader = calib_data_loader
......@@ -327,7 +327,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
calib = calibrator.Calibrator(calib_data, self.calibration_cache, self.batchsize, self.calibrate_type)
# build inference engine with calibration
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype, calib)
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bits, self.strict_datatype, calib)
return engine.create_execution_context()
def _tensorrt_build_withoutcalib(self, onnx_path):
......@@ -344,7 +344,7 @@ class ModelSpeedupTensorRT(BaseModelSpeedup):
tensorrt.IExecutionContext
Context for executing inference using an ICudaEngine
"""
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bit, self.strict_datatype)
engine = build_engine(onnx_path, self.onnx_config, self.extra_layer_bits, self.strict_datatype)
return engine.create_execution_context()
def inference(self, test_data):
......
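Putting it together, a minimal sketch of constructing the speed-up wrapper with the renamed parameter (shapes, batch size and the calibration_config variable are illustrative; the engine build step that calls the _tensorrt_build_* helpers above is outside this hunk):

rt = ModelSpeedupTensorRT(
    model,
    input_shape=(1, 1, 28, 28),
    config=calibration_config,   # per-layer 'weight_bits'/'output_bits' plus tracked input/output ranges
    extra_layer_bits=32,
    strict_datatype=True,
    batchsize=32,
)
# After the engine is built, predictions come from rt.inference(test_data).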
......@@ -49,7 +49,8 @@ class CompressorTestCase(TestCase):
}]
model.relu = torch.nn.ReLU()
quantizer = torch_quantizer.QAT_Quantizer(model, config_list)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = torch_quantizer.QAT_Quantizer(model, config_list, optimizer)
quantizer.compress()
modules_to_compress = quantizer.get_modules_to_compress()
modules_to_compress_name = [t[0].name for t in modules_to_compress]
......@@ -317,7 +318,9 @@ class CompressorTestCase(TestCase):
'op_types': ['ReLU']
}]
model.relu = torch.nn.ReLU()
quantizer = torch_quantizer.QAT_Quantizer(model, config_list)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = torch_quantizer.QAT_Quantizer(model, config_list, optimizer)
quantizer.compress()
# test quantize
......@@ -350,14 +353,14 @@ class CompressorTestCase(TestCase):
eps = 1e-7
x = torch.tensor([[-0.2, 0], [0.1, 0.2]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_activation, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_output, 0, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_output, 0.002, abs_tol=eps)
quantizer.step_with_optimizer()
x = torch.tensor([[0.2, 0.4], [0.6, 0.8]])
out = model.relu(x)
assert math.isclose(model.relu.module.tracked_min_activation, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_activation, 0.00998, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_min_output, 0.002, abs_tol=eps)
assert math.isclose(model.relu.module.tracked_max_output, 0.00998, abs_tol=eps)
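For context on the expected values, a worked check of the moving-average tracking these assertions rely on (the 0.01/0.99 EMA factor is inferred from the numbers, so treat it as an assumption about the quantizer's internals):

ema = 0.01
tracked_min, tracked_max = 0.0, 0.0
# step 1: relu([[-0.2, 0], [0.1, 0.2]]) has min 0.0 and max 0.2
tracked_min = (1 - ema) * tracked_min + ema * 0.0   # -> 0.0
tracked_max = (1 - ema) * tracked_max + ema * 0.2   # -> 0.002
# step 2: relu([[0.2, 0.4], [0.6, 0.8]]) has min 0.2 and max 0.8
tracked_min = (1 - ema) * tracked_min + ema * 0.2   # -> 0.002
tracked_max = (1 - ema) * tracked_max + ema * 0.8   # -> 0.00998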
def test_torch_quantizer_export(self):
config_list_qat = [{
......@@ -392,7 +395,8 @@ class CompressorTestCase(TestCase):
for config, quantize_algorithm in zip(config_set, quantize_algorithm_set):
model = TorchModel()
model.relu = torch.nn.ReLU()
quantizer = quantize_algorithm(model, config)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = quantize_algorithm(model, config, optimizer)
quantizer.compress()
x = torch.rand((1, 1, 28, 28), requires_grad=True)
......