[Quantization speedup]support TensorRT8.0.0 (#3866)

a4760ce8 · lin bin · GitHub · 4b1f46a3 · a4760ce8 · a4760ce8
Unverified Commit a4760ce8 authored Jul 09, 2021 by lin bin Committed by GitHub Jul 09, 2021
2 changed files
--- a/docs/en_US/Compression/QuantizationSpeedup.rst
+++ b/docs/en_US/Compression/QuantizationSpeedup.rst
@@ -50,6 +50,10 @@ CUDA version >= 11.0
 TensorRT version >= 7.2
+Note
+* If you have not installed TensorRT yet, or your installed version is older than required, please refer to the `TensorRT Installation Guide <https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html>`__.
 Usage
 -----
 quantization aware training:

--- a/nni/compression/pytorch/quantization_speedup/integrated_tensorrt.py
+++ b/nni/compression/pytorch/quantization_speedup/integrated_tensorrt.py
@@ -12,7 +12,8 @@ from . import calibrator as calibrator
 from . import trt_pycuda as common
 from .backend import BaseModelSpeedup
-# TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
+TRT8 = 8
+TRT7 = 7
 TRT_LOGGER = trt.Logger()
 logger = logging.getLogger(__name__)
@@ -120,18 +121,39 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
        An ICudaEngine for executing inference on a built network
    """
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(common.EXPLICIT_BATCH) as network, \
-        trt.OnnxParser(network, TRT_LOGGER) as parser:
+        trt.OnnxParser(network, TRT_LOGGER) as parser, builder.create_builder_config() as trt_config:
        # Attention that, builder should be set to 1 because of the implementation of allocate_buffer
+        trt_version = int(trt.__version__[0])
+        assert trt_version == TRT8 or trt_version == TRT7, "Version of TensorRT is too old, please \
+            update TensorRT to version >= 7.0"
+        if trt_version == TRT7:
+            logger.warning("TensorRT7 is deprecated and may be removed in the following release.")
        builder.max_batch_size = 1
+        if trt_version == TRT8:
+            trt_config.max_workspace_size = common.GiB(4)
+        else:
            builder.max_workspace_size = common.GiB(4)
        if extra_layer_bit == 32 and config is None:
            pass
        elif extra_layer_bit == 16 and config is None:
+            if trt_version == TRT8:
+                trt_config.set_flag(trt.BuilderFlag.FP16)
+            else:
                builder.fp16_mode = True
        elif extra_layer_bit == 8 and config is None:
            # entire model in 8bit mode
+            if trt_version == TRT8:
+                trt_config.set_flag(trt.BuilderFlag.INT8)
+            else:
                builder.int8_mode = True
+        else:
+            if trt_version == TRT8:
+                trt_config.set_flag(trt.BuilderFlag.INT8)
+                trt_config.set_flag(trt.BuilderFlag.FP16)
+                if strict_datatype:
+                    trt_config.set_flag(trt.BuilderFlag.STRICT_TYPES)
            else:
                builder.int8_mode = True
                builder.fp16_mode = True
@@ -148,6 +170,9 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
                return None
        if calib is not None:
+            if trt_version == TRT8:
+                trt_config.int8_calibrator = calib
+            else:
                builder.int8_calibrator = calib
            # This design may not be correct if output more than one
            for i in range(network.num_layers):
@@ -196,7 +221,10 @@ def build_engine(model_file, config=None, extra_layer_bit=32, strict_datatype=Fa
                    out_tensor.dynamic_range = (tracked_min_activation, tracked_max_activation)
        # Build engine and do int8 calibration.
-        engine = builder.build_cuda_engine(network)
+        if trt_version == TRT8:
+            engine = builder.build_engine(network, trt_config)
+        else:
+            engine.builder.build_cuda_engine(network)
        return engine
 class ModelSpeedupTensorRT(BaseModelSpeedup):