# engine.py

#
# Copyright 2022 The HuggingFace Inc. team.
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from collections import OrderedDict
from copy import copy

import numpy as np
import tensorrt as trt
import torch
from polygraphy import cuda
from polygraphy.backend.common import bytes_from_path
from polygraphy.backend.trt import CreateConfig, Profile
from polygraphy.backend.trt import (
    engine_from_bytes,
    engine_from_network,
    network_from_onnx_path,
    save_engine,
)
from polygraphy.backend.trt import util as trt_util
import ctypes
from glob import glob
from cuda import cudart

# Module-level TensorRT logger created with INFO severity.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# Install the same logger object on Polygraphy's TensorRT utility module
# (presumably so its helpers log through it -- confirm against Polygraphy).
trt_util.TRT_LOGGER = TRT_LOGGER


class Engine:
    def __init__(
        self,
        model_name,
        engine_dir,
        onnx_file=None,
    ):
        """Prepare an engine wrapper without loading or building anything.

        Args:
            model_name: Base name of the model; the serialized plan file is
                named ``<model_name>.plan``.
            engine_dir: Directory in which the plan file lives.
            onnx_file: Optional value stored unchanged on ``self.onnx_file``.
        """
        plan_filename = model_name + ".plan"
        self.engine_path = os.path.join(engine_dir, plan_filename)

        # Runtime objects: unset / empty until the engine is activated and
        # buffers are allocated.
        self.tensors = OrderedDict()
        self.buffers = OrderedDict()
        self.context = None
        self.engine = None

        # Remaining bookkeeping attributes, not touched by the other methods
        # of this class (names suggest weight refit / LoRA support -- confirm
        # with their users).
        self.onnx_file = onnx_file
        self.onnx_initializers = None
        self.weightNameList = None
        self.refitter = None
        self.torch_weight = None
        self.trt_lora_weight = None
        self.trt_lora_weight_mem = None

    def __del__(self):
        """Drop the references to the engine, context and buffers.

        Attributes are removed with a default, so finalizing an instance whose
        ``__init__`` did not complete (or finalizing it a second time) does not
        raise ``AttributeError`` from inside the finalizer.
        """
        # The execution context is released before the engine it was created
        # from; the buffer/tensor maps follow.
        for attr in ("context", "engine", "buffers", "tensors"):
            self.__dict__.pop(attr, None)

    def build(
        self,
        onnx_path,
        fp16,
        input_profile=None,
        enable_preview=False,
        sparse_weights=False,
    ):
        """Build a TensorRT engine from an ONNX file and save it to ``self.engine_path``.

        Args:
            onnx_path: Path of the ONNX model handed to ``network_from_onnx_path``.
            fp16: Forwarded to ``CreateConfig(fp16=...)``.
            input_profile: Optional mapping ``name -> (min, opt, max)`` of
                shapes added to the single optimization profile.
            enable_preview: When true, request the
                ``FASTER_DYNAMIC_SHAPES_0805`` preview feature, provided the
                installed TensorRT is 8.5.1 or newer and exposes that member.
            sparse_weights: Forwarded to ``CreateConfig(sparse_weights=...)``.

        Raises:
            ValueError: If an ``input_profile`` entry does not hold exactly
                three shapes.
        """
        print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
        profile = Profile()
        for name, dims in (input_profile or {}).items():
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if len(dims) != 3:
                raise ValueError(
                    f"input_profile[{name!r}] must hold exactly three shapes "
                    f"(min, opt, max), got {len(dims)}"
                )
            profile.add(name, min=dims[0], opt=dims[1], max=dims[2])

        preview_features = []
        if enable_preview:
            # Keep only the leading digits of each version component so that
            # strings such as "10.0.0b6" do not break the integer conversion.
            version = []
            for part in trt.__version__.split(".")[:3]:
                digits = ""
                for char in part:
                    if not char.isdigit():
                        break
                    digits += char
                version.append(int(digits) if digits else 0)
            version += [0] * (3 - len(version))

            # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above.
            if tuple(version) >= (8, 5, 1):
                # Looked up defensively: the member may be absent from the
                # installed TensorRT's PreviewFeature enum.
                feature = getattr(
                    trt.PreviewFeature, "FASTER_DYNAMIC_SHAPES_0805", None
                )
                if feature is not None:
                    preview_features = [feature]

        engine = engine_from_network(
            network_from_onnx_path(onnx_path),
            config=CreateConfig(
                fp16=fp16,
                profiles=[profile],
                preview_features=preview_features,
                sparse_weights=sparse_weights,
            ),
        )
        save_engine(engine, path=self.engine_path)

    def activate(self, plugin_path=""):
        """Deserialize the engine at ``self.engine_path`` and create its context.

        Args:
            plugin_path: Path of a shared library to load through ``ctypes``
                before the engine is deserialized (presumably a TensorRT plugin
                library -- confirm with callers). When empty, which is the
                default, no library is loaded.
        """
        # Only call the loader when a path was actually supplied; an empty
        # string does not name a library.
        if plugin_path:
            ctypes.cdll.LoadLibrary(plugin_path)
        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
        self.context = self.engine.create_execution_context()

    def get_shared_memory(self):
        """Allocate device memory sized for this engine and return the pointer.

        The allocation size is ``self.engine.device_memory_size``. The pointer
        is also stored on ``self.device_memory``.

        Returns:
            The device pointer returned by ``cudart.cudaMalloc``.

        Raises:
            RuntimeError: If ``cudaMalloc`` does not report ``cudaSuccess``.
        """
        size = self.engine.device_memory_size
        status, device_memory = cudart.cudaMalloc(size)
        # Fail here rather than handing an invalid pointer to later calls.
        if status != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError(
                f"cudaMalloc of {size} bytes for {self.engine_path} failed: {status}"
            )
        # NOTE(review): nothing in this class frees this allocation, and every
        # call allocates again -- confirm who owns and releases the memory.
        self.device_memory = device_memory
        return self.device_memory

    def set_shared_memory(self, device_memory_size):
        """Assign ``device_memory_size`` to the execution context's ``device_memory``.

        NOTE(review): despite the parameter name, the value is stored as the
        context's device memory, so it is presumably a device pointer such as
        the one returned by ``get_shared_memory`` -- confirm with callers.
        """
        execution_context = self.context
        execution_context.device_memory = device_memory_size

    def binding_input(self, name, shape):
        """Set ``shape`` on the execution context for the binding called ``name``.

        Args:
            name: Binding name, resolved to an index through the engine.
            shape: Shape passed on to ``context.set_binding_shape``.

        Returns:
            Whatever ``context.set_binding_shape`` returns for that binding.
        """
        binding_index = self.engine.get_binding_index(name)
        return self.context.set_binding_shape(binding_index, shape)

    def allocate_buffers(self, shape_dict=None, device="cuda"):
        """Allocate one torch tensor per binding and wrap each in a ``DeviceView``.

        For every binding of a profile the shape is taken from ``shape_dict``
        when the binding name is present there, otherwise from the engine.
        Input bindings also get that shape set on the execution context. The
        tensors are stored in ``self.tensors`` and the device views in
        ``self.buffers``, both keyed by binding name.

        Args:
            shape_dict: Optional mapping of binding name to shape.
            device: Torch device on which the tensors are allocated.

        Raises:
            KeyError: If a binding uses a TensorRT data type that has no numpy
                counterpart in the table built below.
        """
        print("Allocate buffers and bindings inputs:")

        # Built once, and by member name: not every TensorRT release defines
        # every trt.DataType member (INT64, for instance), so members missing
        # from the installed version are skipped instead of raising
        # AttributeError.
        numpy_by_trt_name = {
            "FLOAT": np.float32,
            "HALF": np.float16,
            "INT8": np.int8,
            "INT32": np.int32,
            "INT64": np.int64,
            "UINT8": np.uint8,
            "BOOL": bool,
        }
        dtype_map = {
            getattr(trt.DataType, trt_name): np_dtype
            for trt_name, np_dtype in numpy_by_trt_name.items()
            if hasattr(trt.DataType, trt_name)
        }
        requested_shapes = shape_dict or {}

        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
            binding = self.engine[idx]
            print("binding: ", binding)
            if binding in requested_shapes:
                shape = requested_shapes[binding]
            else:
                shape = self.engine.get_binding_shape(binding)
            dtype = dtype_map[self.engine.get_binding_dtype(binding)]
            if self.engine.binding_is_input(binding):
                self.context.set_binding_shape(idx, shape)

            # Workaround to convert np dtype to torch: round-trip a
            # zero-dimensional array and read the dtype torch chose for it.
            torch_dtype = torch.from_numpy(np.empty(shape=[], dtype=dtype)).dtype
            # Allocate directly on the target device rather than on the CPU
            # followed by a copy.
            tensor = torch.empty(tuple(shape), dtype=torch_dtype, device=device)

            print(f"  binding={binding}, shape={shape}, dtype={tensor.dtype}")
            self.tensors[binding] = tensor
            self.buffers[binding] = cuda.DeviceView(
                ptr=tensor.data_ptr(), shape=shape, dtype=dtype
            )

    def infer(self, feed_dict, stream):
        """Execute the engine asynchronously on ``stream`` and return ``self.tensors``.

        Args:
            feed_dict: Mapping of binding name to ``cuda.DeviceView``. Each
                entry replaces the pre-allocated buffer of that name for this
                call only, and its shape is set on the execution context.
            stream: Object whose ``ptr`` attribute is passed as the stream
                handle.

        Returns:
            ``self.tensors``, after every non-input tensor has been resized in
            place to the shape the context reports for its binding.

        Raises:
            TypeError: If a ``feed_dict`` value is not a ``cuda.DeviceView``.
            ValueError: If the asynchronous execution call reports failure.
        """
        start_binding, _ = trt_util.get_active_profile_bindings(self.context)
        # Shallow copy so per-call overrides never replace the stored buffers.
        device_buffers = copy(self.buffers)
        for name, buf in feed_dict.items():
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if not isinstance(buf, cuda.DeviceView):
                raise TypeError(
                    f"feed_dict[{name!r}] must be a cuda.DeviceView, "
                    f"got {type(buf).__name__}"
                )
            device_buffers[name] = buf
            self.binding_input(name, buf.shape)

        # Slots before the active profile's first binding index are filled
        # with 0; the buffers follow in their stored order.
        bindings = [0] * start_binding
        bindings += [view.ptr for view in device_buffers.values()]
        noerror = self.context.execute_async_v2(
            bindings=bindings, stream_handle=stream.ptr
        )
        if not noerror:
            raise ValueError("ERROR: inference failed.")

        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
            binding = self.engine[idx]
            if self.engine.binding_is_input(binding):
                continue
            shape = self.context.get_binding_shape(idx)
            self.tensors[binding].resize_(tuple(shape))
        return self.tensors