# deployer_lib.py

#!/usr/bin/python

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, 
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License. 


import os
import sys
import time
import json
import torch
import argparse
import statistics
from collections import Counter


# Lookup table from a torch dtype to the data-type name written into the
# "data_type" field of the generated model configuration.
torch_type_to_triton_type = {
    torch.bool: 'TYPE_BOOL',
    torch.int8: 'TYPE_INT8',
    torch.int16: 'TYPE_INT16',
    torch.int32: 'TYPE_INT32',
    torch.int64: 'TYPE_INT64',
    torch.uint8: 'TYPE_UINT8',
    torch.float16: 'TYPE_FP16',
    torch.float32: 'TYPE_FP32',
    torch.float64: 'TYPE_FP64',
}


# Skeleton of the model configuration file written by
# DeployerLibrary.write_config. The single-brace fields are filled in with
# str.format_map; doubled braces are escapes that render as literal braces.
# Fields that receive an empty string leave blank lines behind, which
# write_config removes with remove_empty_lines before writing the file.
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
    {spec_inputs}
]
output [
    {spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
    {{
        count: {engine_count}
        kind: KIND_GPU
        gpus: [ {gpu_list} ]
    }}
]"""


# One entry of the "input [...]" list of the model configuration.
# write_config formats it once per input tensor (named "input__<index>") and
# concatenates the results; the trailing comma separates the entries and
# write_config cuts off the last one.
INPUT_TEMPLATE = r"""
{{
    name: "input__{num}"
    data_type: {type}
    dims: {dims}
    {reshape}
}},"""


# One entry of the "output [...]" list of the model configuration.
# write_config formats it once per output tensor (named "output__<index>") and
# concatenates the results; the trailing comma separates the entries and
# write_config cuts off the last one.
OUTPUT_TEMPLATE = r""" 
{{
    name: "output__{num}"
    data_type: {type}
    dims: {dims}
    {reshape}
}},"""


# "optimization" section of the model configuration. write_config fills
# {execution_accelerator} with the rendered EXECUTION_ACCELERATOR_TEMPLATE (or
# an empty string) and {capture_cuda_graph} with the value of the
# --capture-cuda-graph command line argument.
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
  {execution_accelerator}
  cuda {{
    graphs: {capture_cuda_graph}
  }}
}}"""


# Fragment that names "tensorrt" as GPU execution accelerator. It has no
# format fields (only escaped braces); write_config inserts it into the
# optimization section only when the platform is 'onnxruntime_onnx'.
EXECUTION_ACCELERATOR_TEMPLATE = r"""
  execution_accelerators {{
    gpu_execution_accelerator: [
      {{
        name: "tensorrt"
      }}
    ]
  }},"""


def remove_empty_lines(text):
    ''' strips the text and drops every line that is empty or whitespace-only,
        keeps the line endings of the remaining lines, returns the result '''
    stripped_text = text.strip()
    return "".join(
        line
        for line in stripped_text.splitlines(keepends=True)
        if line.strip()
    )


def create_deployer(argv):
    ''' takes a list of arguments, returns a deployer object and the list of unused arguments

        :: argv :: list of command line arguments (without the program name)
        returns a tuple (deployer, model_arguments); model_arguments are the
        trailing arguments that are not consumed here, with a leading '--'
        separator removed when present
        exactly one of --ts-script, --ts-trace, --onnx, --trt is required;
        argparse exits the program on invalid arguments
    '''
    parser = argparse.ArgumentParser()
    # required args
    method = parser.add_mutually_exclusive_group(required=True)
    method.add_argument('--ts-script',
                        action='store_true',
                        help='convert to torchscript using torch.jit.script')
    method.add_argument('--ts-trace',
                        action='store_true',
                        help='convert to torchscript using torch.jit.trace')
    method.add_argument('--onnx',
                        action='store_true',
                        help='convert to onnx using torch.onnx.export')
    method.add_argument('--trt',
                        action='store_true',
                        help='convert to trt using tensorrt')
    # triton related args
    arguments = parser.add_argument_group('triton related flags')
    arguments.add_argument('--triton-no-cuda',
                            action='store_true',
                            help='Use the CPU for tracing.')
    arguments.add_argument('--triton-model-name',
                            type=str,
                            default="model",
                            help="exports to appropriate directory structure for TRTIS")
    arguments.add_argument("--triton-model-version",
                            type=int,
                            default=1,
                            help="exports to appropriate directory structure for TRTIS")
    arguments.add_argument("--triton-server-url",
                            type=str,
                            default="localhost:8001",
                            help="exports to appropriate directory structure for TRTIS")
    arguments.add_argument("--triton-max-batch-size",
                            type=int,
                            default=8,
                            help="Specifies the 'max_batch_size' in the TRTIS model config.\
                                  See the TRTIS documentation for more info.")
    arguments.add_argument("--triton-dyn-batching-delay",
                            type=float,
                            default=0,
                            help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
                                  the TRTIS model config. Use '0' or '-1' to specify static batching.\
                                  See the TRTIS documentation for more info.")
    arguments.add_argument("--triton-engine-count",
                            type=int,
                            default=1,
                            help="Specifies the 'instance_group' count value in the TRTIS model config.\
                                  See the TRTIS documentation for more info.")
    arguments.add_argument('--save-dir', type=str, default='./triton_models', help='Saved model directory')
    # optimization args
    arguments = parser.add_argument_group('optimization flags')
    arguments.add_argument("--max_workspace_size",
                            type=int,
                            default=512*1024*1024,
                            help="set the size of the workspace for trt export")
    arguments.add_argument("--trt-fp16",
                            action='store_true',
                            help="trt flag ---- export model in mixed precision mode")
    arguments.add_argument("--capture-cuda-graph",
                            type=int,
                            default=1,
                            help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 1. ")
    arguments.add_argument('--quantize',
                            action='store_true',
                            help='apply quantization for supported nodes')
    arguments.add_argument('--calibrate',
                            action='store_true',
                            help='apply calibration for supported nodes')
    # remainder args
    arguments.add_argument('model_arguments', nargs=argparse.REMAINDER, help='arguments that will be ignored by deployer lib and will be forwarded to your deployer script')
    # parse and build the deployer
    args = parser.parse_args(argv)
    deployer = Deployer(args)
    # The remainder normally starts with the '--' separator. Remove it only
    # when it really is the separator, so that a first forwarded argument is
    # not silently dropped when the caller left the separator out.
    model_arguments = args.model_arguments
    if model_arguments and model_arguments[0] == '--':
        model_arguments = model_arguments[1:]
    return deployer, model_arguments


class DeployerLibrary:
    def __init__(self, args):
        ''' keeps the parsed command line arguments
            the platform starts out unset, see set_platform
        '''
        self.platform = None
        self.args = args
    
    def set_platform(self, platform):
        ''' remembers the platform string that write_config puts into the config
            :: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
        '''
        self.platform = platform
    
    def build_trt_engine(self, model_file, shapes):
        ''' takes a path to an onnx file, and shape information, returns a trt engine
            :: model_file :: path to an onnx model
            :: shapes :: list of dictionaries, one per input, with the keys
                         'name', 'min', 'opt' and 'max' (input name and the
                         min / opt / max shape for the optimization profile)
            returns whatever builder.build_engine returns; the caller in this
            file (Deployer.to_triton_trt) treats None as an export failure
        '''
        # tensorrt is imported lazily, so the module can be used without it
        # as long as this method is not called
        import tensorrt as trt
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        builder = trt.Builder(TRT_LOGGER)
        builder.fp16_mode = self.args.trt_fp16
        builder.max_batch_size = self.args.triton_max_batch_size
        # builder configuration: workspace size and (optionally) the FP16 flag
        config = builder.create_builder_config()
        config.max_workspace_size = self.args.max_workspace_size
        if self.args.trt_fp16:
            config.flags |= 1 << int(trt.BuilderFlag.FP16)
        # a single optimization profile carries the shape ranges of all inputs
        profile = builder.create_optimization_profile()
        for s in shapes:
            profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
        config.add_optimization_profile(profile)
        # the network is created with the EXPLICIT_BATCH creation flag
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(explicit_batch)
        # parse the onnx file into the network and build the engine
        with trt.OnnxParser(network, TRT_LOGGER) as parser:
            with open(model_file, 'rb') as model:
                # the return value of parse() is not checked: parser errors
                # are only printed and the engine build is attempted anyway
                parser.parse(model.read())
                for i in range(parser.num_errors):
                    e = parser.get_error(i)
                    print("||||e", e)
                engine = builder.build_engine(network, config=config)
        return engine
    
    def load_engine(self, engine_filepath):
        ''' reads the serialized trt engine stored at engine_filepath,
            deserializes it with a trt runtime and returns the engine '''
        import tensorrt as trt
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_filepath, "rb") as plan_file:
            with trt.Runtime(logger) as runtime:
                serialized_engine = plan_file.read()
                return runtime.deserialize_cuda_engine(serialized_engine)
    
    def prepare_inputs(self, dataloader, device):
        ''' load sample inputs to device '''
        def _move_to_device(maybe_tensor):
            if torch.is_tensor(maybe_tensor):
                return maybe_tensor.to(device)
            elif isinstance(maybe_tensor, dict):
                return {
                    key: _move_to_device(value)
                    for key, value in maybe_tensor.items()
                }
            elif isinstance(maybe_tensor, list) or isinstance(maybe_tensor, tuple):
                return [_move_to_device(x) for x in maybe_tensor]
            else:
                return maybe_tensor

        inputs = []
        for batch in dataloader:
            batch_d = _move_to_device(batch)
            if not hasattr(batch_d, '__iter__'):
                batch_d = (batch_d,)
            inputs.append(batch_d)

        return inputs
    
    def get_list_of_shapes(self, l, fun):
        ''' returns the list of min/max shapes, depending on fun
            :: l :: list of tuples of tensors
            :: fun :: min or max
        '''
        tensor_tuple = l[0]
        shapes = [list(x.shape) for x in tensor_tuple]
        for tensor_tuple in l:
            assert len(tensor_tuple) == len(shapes), "tensors with varying shape lengths are not supported"
            for i,x in enumerate(tensor_tuple):
                for j in range(len(x.shape)):
                    shapes[i][j] = fun(shapes[i][j], x.shape[j])
        return shapes # a list of shapes
    
    def get_tuple_of_min_shapes(self, l):
        ''' returns the tuple of min shapes 
            :: l :: list of tuples of tensors '''
        shapes = self.get_list_of_shapes(l, min)
        min_batch = 1
        shapes = [[min_batch,*shape[1:]] for shape in shapes]
        shapes = tuple(shapes)
        return shapes # tuple of min shapes
    
    def get_tuple_of_max_shapes(self, l):
        ''' returns the tuple of max shapes 
            :: l :: list of tuples of tensors '''
        shapes = self.get_list_of_shapes(l, max)
        max_batch = max(2,shapes[0][0])
        shapes = [[max_batch,*shape[1:]] for shape in shapes]
        shapes = tuple(shapes)
        return shapes # tuple of max shapes
    
    def get_tuple_of_opt_shapes(self, l):
        ''' returns the tuple of opt shapes 
            :: l :: list of tuples of tensors '''
        counter = Counter()
        for tensor_tuple in l:
            shapes = [tuple(x.shape) for x in tensor_tuple]
            shapes = tuple(shapes)
            counter[shapes] += 1
        shapes = counter.most_common(1)[0][0]
        return shapes # tuple of most common occuring shapes
    
    def get_tuple_of_dynamic_shapes(self, l):
        ''' returns a tuple of dynamic shapes: variable tensor dimensions 
            (for ex. batch size) occur as -1 in the tuple
            :: l :: list of tuples of tensors '''
        tensor_tuple = l[0]
        shapes = [list(x.shape) for x in tensor_tuple]
        for tensor_tuple in l:
            err_msg = "tensors with varying shape lengths are not supported"
            assert len(tensor_tuple) == len(shapes), err_msg
            for i,x in enumerate(tensor_tuple):
                for j in range(len(x.shape)):
                    if shapes[i][j] != x.shape[j] or j == 0:
                        shapes[i][j] = -1
        shapes = tuple(shapes)
        return shapes # tuple of dynamic shapes
    
    def run_models(self, models, inputs):
        ''' run the models on inputs, return the outputs and execution times '''
        ret = []
        for model in models:
            torch.cuda.synchronize()
            time_start = time.time()
            outputs = []
            for input in inputs:
                with torch.no_grad():
                    output = model(*input)
                if type(output) is torch.Tensor:
                    output = [output]
                elif type(output) is dict:
                    output = list(output.items())
                    output.sort(key=lambda x: x[0])
                    output = [x[0] for x in output]
                outputs.append(output)
            torch.cuda.synchronize()
            time_end = time.time()
            t = time_end - time_start
            ret.append(outputs)
            ret.append(t)
        return ret

    def compute_tensor_stats(self, tensor):
        #if tensor is not empty
        if tensor.numel():
            return {'std': tensor.std().item(),
                    'mean': tensor.mean().item(),
                    'max': tensor.max().item(),
                    'min': tensor.min().item(),
            }
        else:
            return {'std': 0,
                    'mean':0,
                    'max': 0,
                    'min': 0,
            }

    def compute_errors(self, outputs_A, outputs_B):
        ''' returns dictionary with errors statistics '''
        device = outputs_A[0][0][0].device
        dtype = outputs_A[0][0][0].dtype
        num_outputs = len(outputs_A[0])
        x_values = [torch.zeros(0, device = device, dtype = dtype) for _ in range(num_outputs)]
        y_values = [torch.zeros(0, device = device, dtype = dtype) for _ in range(num_outputs)]
        d_values = [torch.zeros(0, device = device, dtype = dtype) for _ in range(num_outputs)]
        for output_A,output_B in zip(outputs_A,outputs_B):
            for i,(x,y) in enumerate(zip(output_A, output_B)):
                x = x.view(-1).float()
                y = y.view(-1).float()
                d = abs(x - y)
                x_values[i] = torch.cat((x_values[i], x), 0)
                y_values[i] = torch.cat((y_values[i], y), 0)
                d_values[i] = torch.cat((d_values[i], d), 0)
        Error_stats = [{'Original': self.compute_tensor_stats(x),
                       'Converted': self.compute_tensor_stats(y),
                       'Absolute difference': self.compute_tensor_stats(d),
                           } for x,y,z in zip(x_values, y_values, d_values)]
        return Error_stats
    
    def print_errors(self, Error_stats):
        ''' prints the error statistics of every output as a table
            :: Error_stats :: list of dictionaries as returned by compute_errors '''
        banner = ("", "conversion correctness test results", "-----------------------------------")
        for banner_line in banner:
            print(banner_line)
        import pandas as pd
        for index, stats in enumerate(Error_stats):
            print(f'Output {index}:')
            table = pd.DataFrame(stats)
            print(table)
    
    def write_config(self, config_filename, 
                     input_shapes, input_types, 
                     output_shapes, output_types):
        ''' writes TRTIS config file 
            :: config_filename :: the file to write the config file into
            :: input_shapes :: tuple of dynamic shapes of the input tensors
            :: input_types :: tuple of torch types of the input tensors
            :: output_shapes :: tuple of dynamic shapes of the output tensors
            :: output_types :: tuple of torch types of the output tensors
            the platform must have been set with set_platform before
            raises KeyError for a torch type without a triton counterpart
        '''
        assert self.platform is not None, "error - platform is not set"

        def _format_specs(template, shapes, types):
            # renders one template entry per tensor, returns them concatenated
            entries = []
            for num, (shape, typ) in enumerate(zip(shapes, types)):
                # a tensor that only has the batch dimension is declared with
                # dims [1] plus a reshape to an empty shape
                batch_dim_only = len(shape) == 1
                entries.append(template.format_map({
                    'num': str(num),
                    'type': torch_type_to_triton_type[typ],
                    # first dimension is the batch size
                    'dims': str([1]) if batch_dim_only else str(list(shape)[1:]),
                    'reshape': 'reshape: { shape: [ ] }' if batch_dim_only else '',
                }))
            # every entry ends with a comma, cut off the last one
            return "".join(entries)[:-1]

        spec_inputs = _format_specs(INPUT_TEMPLATE, input_shapes, input_types)
        spec_outputs = _format_specs(OUTPUT_TEMPLATE, output_shapes, output_types)

        max_batch_size = self.args.triton_max_batch_size

        batching_str = ""
        if self.args.triton_dyn_batching_delay > 0:
            # Use only full and half full batches. Non-positive sizes are
            # left out, so a max batch size of 1 gives [1] instead of [0, 1].
            candidates = (max_batch_size // 2, max_batch_size)
            pref_batch_size = sorted({size for size in candidates if size > 0})
            batching_template = (
                "\n"
                "dynamic_batching {{\n"
                "    preferred_batch_size: [{0}]\n"
                "    max_queue_delay_microseconds: {1}\n"
                "}}"
            )
            # the delay argument is given in milliseconds
            batching_str = batching_template.format(
                ", ".join(str(size) for size in pref_batch_size),
                int(self.args.triton_dyn_batching_delay * 1000.0))

        # the tensorrt execution accelerator is requested for onnx models only
        accelerator_str = ""
        if self.platform == 'onnxruntime_onnx':
            accelerator_str = EXECUTION_ACCELERATOR_TEMPLATE.format_map({})

        optimization_str = MODEL_OPTIMIZATION_TEMPLATE.format_map({
            "execution_accelerator": accelerator_str,
            "capture_cuda_graph": str(self.args.capture_cuda_graph),
        })

        # the instance group lists every GPU that torch can see
        gpu_list = ", ".join(str(x) for x in range(torch.cuda.device_count()))
        config_values = {
            "model_name": self.args.triton_model_name,
            "platform": self.platform,
            "max_batch_size": max_batch_size,
            "spec_inputs": spec_inputs,
            "spec_outputs": spec_outputs,
            "dynamic_batching": batching_str,
            "model_optimizations": optimization_str,
            "gpu_list": gpu_list,
            "engine_count": self.args.triton_engine_count,
        }

        # write config 
        with open(config_filename, "w") as file:
            final_config_str = CONFIG_TEMPLATE.format_map(config_values)
            final_config_str = remove_empty_lines(final_config_str)
            file.write(final_config_str)


class Deployer:
    def __init__(self, args):
        ''' keeps the parsed command line arguments and creates the
            DeployerLibrary helper from the same arguments '''
        self.lib = DeployerLibrary(args)
        self.args = args
    
    def deploy(self, dataloader, model):
        ''' deploy the model and test for correctness with dataloader '''
        if self.args.ts_script or self.args.ts_trace:
            self.lib.set_platform("pytorch_libtorch")
            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
            self.to_triton_torchscript(dataloader, model)
        elif self.args.onnx:
            self.lib.set_platform("onnxruntime_onnx")
            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
            self.to_triton_onnx(dataloader, model)
        elif self.args.trt:
            self.lib.set_platform("tensorrt_plan")
            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
            self.to_triton_trt(dataloader, model)
        else:
            assert False, "error"
        print("done")
    
    def to_triton_trt(self, dataloader, model):
        ''' export the model to trt and test correctness on dataloader

            :: dataloader :: iterable of sample batches, see DeployerLibrary.prepare_inputs
            :: model :: the torch model to export

            steps: run the model on all batches, export it to onnx, build a
            trt engine from the onnx file, store the serialized engine as
            <save_dir>/<model_name>/<model_version>/model.plan, reload it,
            compare its outputs with those of the original model and write
            <save_dir>/<model_name>/config.pbtxt
        '''
        # NOTE(review): the name trt is not referenced again in this method;
        # presumably the import is here to fail early when tensorrt is missing
        import tensorrt as trt
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device('cpu')
        else:
            device = torch.device('cuda')

        assert not self.args.quantize, 'quantize flag not supported by trt'
        assert not self.args.calibrate, 'calibrate flag not supported by trt'

        # prepare model 
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "
        
        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)
        
        # generate outputs
        # (only a bare tensor is wrapped into a list here; dict outputs are
        # not converted, unlike in DeployerLibrary.run_models)
        outputs = []
        for input in inputs:
            with torch.no_grad():
                output = model(*input)
            if type(output) is torch.Tensor:
                output = [output]
            outputs.append(output)
        
        # generate input shapes - dynamic tensor shape support 
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
        
        # generate output shapes - dynamic tensor shape support 
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
        
        # generate input types (taken from the first batch)
        input_types = [x.dtype for x in inputs[0]]
        
        # generate output types (taken from the first batch)
        output_types = [x.dtype for x in outputs[0]]
        
        # get input names - same naming scheme as in INPUT_TEMPLATE
        rng = range(len(input_types))
        input_names = ["input__" + str(num) for num in rng]
        
        # get output names - same naming scheme as in OUTPUT_TEMPLATE
        rng = range(len(output_types))
        output_names = ["output__" + str(num) for num in rng]
        
        # prepare save path
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, 'model.plan')
        
        # get indices of dynamic input and output shapes
        # (the dimensions marked with -1 by get_tuple_of_dynamic_shapes)
        dynamic_axes = {}
        for input_name,shape in zip(input_names,input_shapes):
            dynamic_axes[input_name] = [i for i,x in enumerate(shape) if x == -1]
        for output_name,shape in zip(output_names,output_shapes):
            dynamic_axes[output_name] = [i for i,x in enumerate(shape) if x == -1]
        
        # export the model to onnx first
        # (the onnx file is written to final_model_path, i.e. 'model.plan';
        # it is overwritten with the serialized engine further below)
        with torch.no_grad():
            torch.onnx.export(model, inputs[0], final_model_path, verbose=False, 
                              input_names=input_names, output_names=output_names, 
                              dynamic_axes=dynamic_axes, opset_version=11)
      
        # get shapes for the optimization profile of the trt engine
        min_shapes = self.lib.get_tuple_of_min_shapes(inputs)
        opt_shapes = self.lib.get_tuple_of_opt_shapes(inputs)
        max_shapes = self.lib.get_tuple_of_max_shapes(inputs)
        
        # one dictionary per input, in the layout build_trt_engine expects
        zipped = zip(input_names, min_shapes, opt_shapes, max_shapes)
        shapes = []
        for name,min_shape,opt_shape,max_shape in zipped:
            d = {
                 "name":name, 
                 "min": min_shape, 
                 "opt": opt_shape, 
                 "max": max_shape
                }
            shapes.append(d)
        
        # build trt engine
        engine = self.lib.build_trt_engine(final_model_path, shapes)
        assert engine is not None, " trt export failure "
        
        # write trt engine (replaces the intermediate onnx file)
        with open(final_model_path, 'wb') as f:
            f.write(engine.serialize())
        
        # load the model back from disk, so the stored file is what gets tested
        engine = self.lib.load_engine(final_model_path)
        
        class TRT_model:
            ''' callable wrapper around a trt engine, takes torch tensors as
                inputs and returns torch tensors '''
            def __init__(self, engine, input_names, output_names, output_types, device):
                self.engine = engine
                self.context = self.engine.create_execution_context()
                self.input_names = input_names
                self.output_names = output_names
                self.output_types = output_types
                self.device = device
            
            def is_dimension_dynamic(self, dim):
                ''' a dimension counts as dynamic when it is None or not positive '''
                return dim is None or dim <= 0
            
            def is_shape_dynamic(self, shape):
                ''' a shape is dynamic when at least one of its dimensions is dynamic '''
                return any([self.is_dimension_dynamic(dim) for dim in shape])
            
            def __call__(self, *inputs):
                ''' runs the engine on the input tensors, returns a single
                    tensor if there is one output, a list of tensors otherwise '''
                # get input shapes (this local is not used below)
                input_shapes = [x.shape for x in inputs]
                # bindings - one slot per engine binding, filled with the
                # data pointers of the input and output tensors
                bindings = [None] * self.engine.num_bindings
                # set input shapes, bind input tensors
                zipped = zip(self.input_names, inputs)
                for key,input in zipped:
                    idx = self.engine.get_binding_index(key)
                    bindings[idx] = input.data_ptr()
                    if self.engine.is_shape_binding(idx) and self.is_shape_dynamic(self.context.get_shape(idx)):
                        self.context.set_shape_input(idx, input)
                    elif self.is_shape_dynamic(self.engine.get_binding_shape(idx)):
                        self.context.set_binding_shape(idx, input.shape)
                assert self.context.all_binding_shapes_specified, "trt error"
                assert self.context.all_shape_inputs_specified, "trt error"
                # calculate output shapes, allocate output tensors and bind them
                outputs = []
                zipped = zip(self.output_names, self.output_types)
                for key,dtype in zipped:
                    idx = self.engine.get_binding_index(key)
                    shape = self.context.get_binding_shape(idx)
                    shape = tuple(shape)
                    assert -1 not in shape, "trt error"
                    tensor = torch.zeros(shape, dtype=dtype, device=self.device)
                    outputs.append(tensor)
                    bindings[idx] = outputs[-1].data_ptr()
                # run inference
                self.context.execute_v2(bindings=bindings)
                # return the result
                if len(outputs) == 1:
                    outputs = outputs[0]
                return outputs
        
        model_trt = TRT_model(engine, input_names, output_names, output_types, device)
        
        # run both models on inputs
        # (run_models returns a flat list: outputs and time of the first
        # model, followed by outputs and time of the second model)
        assert not model.training, "internal error - model should be in eval() mode! "
        models = (model, model_trt)
        outputs, time_model, outputs_trt, time_model_trt = self.lib.run_models(models, inputs)
        
        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_trt)
        self.lib.print_errors(Error_stats)
        print('time of error check of native model: ', time_model, 'seconds')
        print('time of error check of trt model: ', time_model_trt, 'seconds')
        print()
        
        # write TRTIS config (into the model folder, next to the version folder)
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(config_filename, 
                              input_shapes, input_types, 
                              output_shapes, output_types)
    
    def name_onnx_nodes(self, model_path):
        '''
        Give a name to every unnamed node of an ONNX model and save the
        model back to the same file
            parameter model_path: path to the ONNX model
            return: none

        Uses the module level names onnx and onnxruntime, which
        to_triton_onnx binds before it calls this method.
        '''
        graph_model = onnx.load(model_path)
        # the number in the generated name is the position of the node among
        # all nodes of the graph, named or not
        for position, node in enumerate(graph_model.graph.node):
            if not node.name:
                node.name = "unnamed_node_%d" % position
        # This check partially validates model
        onnx.checker.check_model(graph_model)
        onnx.save(graph_model, model_path)
        # Only inference really checks ONNX model for some issues
        # like duplicated node names
        onnxruntime.InferenceSession(model_path, None)
    
    def to_triton_onnx(self, dataloader, model):
        ''' export the model to onnx and test correctness on dataloader '''
        import onnx as local_onnx
        global onnx
        onnx = local_onnx
        import onnxruntime as local_onnxruntime
        global onnxruntime
        onnxruntime = local_onnxruntime
        # setup device
        if self.args.triton_no_cuda:
            device = torch.device('cpu')
        else:
            device = torch.device('cuda')
        
        if self.args.calibrate:
            assert self.args.quantize, ("calibrate flag not supported "
                                        "without quantize")
        if self.args.quantize:
           try:
               from quantize import quantize, QuantizationMode
           except ImportError as error:
               print('quantize scripts are not present')
               raise error
        
        if self.args.calibrate:
            try:
                import calibrate
            except ImportError as error:
                print('calibrate scripts are not present')
                raise error
        
        # prepare model 
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "
        
        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)
        
        # generate outputs
        outputs = []
        for input in inputs:
            with torch.no_grad():
                output = model(*input)
            if type(output) is torch.Tensor:
                output = [output]
            outputs.append(output)
        
        # generate input shapes - dynamic tensor shape support 
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
        
        # generate output shapes - dynamic tensor shape support 
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
        
        # generate input types 
        input_types = [x.dtype for x in inputs[0]]
        
        # generate output types
        output_types = [x.dtype for x in outputs[0]]
        
        # get input names
        rng = range(len(input_types))
        input_names = ["input__" + str(num) for num in rng]
        
        # get output names
        rng = range(len(output_types))
        output_names = ["output__" + str(num) for num in rng]
        
        # prepare save path
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        if not os.path.exists(version_folder):
            os.makedirs(version_folder)
        final_model_path = os.path.join(version_folder, 'model.onnx')
        
        # get indices of dynamic input and output shapes
        dynamic_axes = {}
        for input_name,input_shape in zip(input_names,input_shapes):
            dynamic_axes[input_name] = [i for i,x in enumerate(input_shape) if x == -1]
        for output_name,output_shape in zip(output_names,output_shapes):
            dynamic_axes[output_name] = [i for i,x in enumerate(output_shape) if x == -1]
        
        # export the model
        assert not model.training, "internal error - model should be in eval() mode! "
        with torch.no_grad():
            torch.onnx.export(model, inputs[0], final_model_path, verbose=False, 
                              input_names=input_names, output_names=output_names, 
                              dynamic_axes=dynamic_axes, opset_version=11)
        
        # syntactic error check
        converted_model = onnx.load(final_model_path)
        # check that the IR is well formed
        onnx.checker.check_model(converted_model)

        # Name unnamed nodes - it helps for some other processing tools
        self.name_onnx_nodes(final_model_path)
        converted_model = onnx.load(final_model_path)
        
        # quantize model
        if self.args.quantize:
            if not self.args.calibrate:
                quantized_model = quantize(
                    converted_model,
                    quantization_mode = QuantizationMode.IntegerOps,
                )
                # check that the IR is well formed
                try:
                    onnx.checker.check_model(quantized_model)
                except onnx.onnx_cpp2py_export.checker.ValidationError as error:
                    # FIXME: It is unclear, why checker fails for quantized model so
                    # this error is ignored currently. Inference works for
                    # some quantized models so lets show warning here
                    print("model check failed with warning: [", error, "]")
                    print("Warning during onnx.checker.check_model in quantized model ignored")
                onnx.save(quantized_model, final_model_path)
            else:

                #assert not self.args.calibrate, 'calibrate flag not supported by ONNX'
                # Parsing command-line arguments
                #parser = argparse.ArgumentParser(description='parsing model and test data set paths')
                #parser.add_argument('--model_path', required=True)
                #parser.add_argument('--dataset_path', required=True)
                #parser.add_argument('--output_model_path', type=str, default='calibrated_quantized_model.onnx')
                #parser.add_argument('--dataset_size', type=int, default=0, help="Number of images or tensors to load. Default is 0 which means all samples")
                #parser.add_argument('--data_preprocess', type=str, required=True, choices=['preprocess_method1', 'preprocess_method2', 'None'], help="Refer to Readme.md for guidance on choosing this option.")
                #args = parser.parse_args()
                #model_path = args.model_path
                #output_model_path = args.output_model_path
                #images_folder = args.dataset_path
                calib_mode = "naive"
                size_limit = 0 # int(args.dataset_size)
                
                # Generating augmented ONNX model
                # FIXME: use proper temporary file path
                augmented_model_path = 'augmented_model.onnx'
                #model = onnx.load(model_path)
                augmented_model = calibrate.augment_graph(converted_model)
                onnx.checker.check_model(augmented_model)
                #onnx.save(augmented_model, final_model_path)
                onnx.save(augmented_model, augmented_model_path)
                
                # Conducting inference
                #session = onnxruntime.InferenceSession(final_model_path, None)
                print(augmented_model_path)
                session = onnxruntime.InferenceSession(augmented_model_path, None)
                #session = onnxruntime.InferenceSession('augmented_modelv3.onnx', None)
                (samples, channels, height, width) = session.get_inputs()[0].shape
                print(session.get_inputs()[0].shape)
                #return
                
                # Generating inputs for quantization
                #if args.data_preprocess == "None":
                #    inputs = load_pb_file(images_folder, args.dataset_size, samples, channels, height, width)
                #else:
                #    inputs = load_batch(images_folder, height, width, args.data_preprocess, size_limit)
                
                import numpy as np
                inputs_calibrate_tmp = inputs[0][0].cpu().numpy()
                
                dict_for_quantization = calibrate.get_intermediate_outputs(
                    final_model_path,
                    session,
                    inputs_calibrate_tmp,
                    calib_mode,
                )
                quantization_params_dict = calibrate.calculate_quantization_params(
                    augmented_model,
                    quantization_thresholds = dict_for_quantization,
                )
                calibrated_quantized_model = quantize(
                    converted_model,
                    quantization_mode = QuantizationMode.QLinearOps,
                    quantization_params = quantization_params_dict,
                )
                onnx.save(calibrated_quantized_model, final_model_path)
                
                print("Calibrated, quantized model saved.")
        
        # load the model
        session = onnxruntime.InferenceSession(final_model_path, None)
        
        class ONNX_model:
            ''' callable wrapper that feeds torch tensors to an onnxruntime session
            and hands the results back as torch tensors on the requested device '''
            def __init__(self, session, input_names, device):
                self.session = session
                self.input_names = input_names
                # keep the target device on the instance so that __call__ does
                # not depend on a variable of the enclosing scope
                self.device = device

            def to_numpy(self, tensor):
                ''' returns the tensor as a numpy array located in host memory '''
                # tensors that require grad must be detached before .numpy()
                if tensor.requires_grad:
                    tensor = tensor.detach()
                return tensor.cpu().numpy()

            def __call__(self, *inputs):
                ''' runs the session on positional inputs (matched to input_names
                by position), returns a single tensor if the session produced
                exactly one output, otherwise a list of tensors '''
                feed = {input_name: self.to_numpy(inputs[i])
                        for i, input_name in enumerate(self.input_names)}
                # None is passed as the list of requested output names
                raw_outputs = self.session.run(None, feed)
                outputs = [torch.from_numpy(raw).to(self.device) for raw in raw_outputs]
                if len(outputs) == 1:
                    return outputs[0]
                return outputs
        
        # wrap the onnxruntime session so it can be called like the native model
        model_onnx = ONNX_model(session, input_names, device)
        
        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        models = (model, model_onnx)
        outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(models, inputs)
        
        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_onnx)
        self.lib.print_errors(Error_stats)
        print('time of error check of native model: ', time_model, 'seconds')
        print('time of error check of onnx model: ', time_model_onnx, 'seconds')
        print()
        
        # write TRTIS config
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(config_filename, 
                              input_shapes, input_types, 
                              output_shapes, output_types)
    
    def to_triton_torchscript(self, dataloader, model):
        ''' export the model to torchscript and test correctness on dataloader

        The model is converted with torch.jit.script (self.args.ts_script) or
        torch.jit.trace (self.args.ts_trace), saved as
        <save_dir>/<triton_model_name>/<triton_model_version>/model.pt and
        loaded back from that file. The native and the reloaded model are then
        run through self.lib.run_models, error statistics and timings are
        printed, and config.pbtxt is written into the model folder.

        If both ts_script and ts_trace are set, the scripted model is exported.

        Raises:
            AssertionError: if self.args.quantize is set (not supported here).
            ValueError: if neither ts_trace nor ts_script is set.
        '''
        # setup device
        device = torch.device('cpu' if self.args.triton_no_cuda else 'cuda')

        # prepare model
        model.to(device)
        model.eval()
        assert not model.training, "internal error - model should be in eval() mode! "

        #TODO: support quantize
        assert not self.args.quantize, 'quantize flag not supported by torchscript yet'

        # fail early with a clear message instead of hitting an unbound
        # model_ts after folders have already been created
        if not (self.args.ts_trace or self.args.ts_script):
            raise ValueError('torchscript export requires ts_trace or ts_script to be set')

        # prepare inputs
        inputs = self.lib.prepare_inputs(dataloader, device)

        # generate input shapes - dynamic tensor shape support
        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)

        # generate input types
        input_types = [x.dtype for x in inputs[0]]

        # prepare save path
        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(version_folder, exist_ok=True)
        final_model_path = os.path.join(version_folder, 'model.pt')

        # convert the model
        with torch.no_grad():
            if self.args.ts_script: # script it (takes precedence when both flags are set)
                model_ts = torch.jit.script(model)
            else: # trace it
                model_ts = torch.jit.trace(model, inputs[0])

        # save the model
        torch.jit.save(model_ts, final_model_path)

        # load the model
        model_ts = torch.jit.load(final_model_path)
        model_ts.eval() # WAR for bug : by default, model_ts gets loaded in training mode

        # run both models on inputs
        assert not model.training, "internal error - model should be in eval() mode! "
        assert not model_ts.training, "internal error - converted model should be in eval() mode! "
        models = (model, model_ts)
        outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(models, inputs)

        # check for errors
        Error_stats = self.lib.compute_errors(outputs, outputs_ts)
        self.lib.print_errors(Error_stats)
        print('time of error check of native model: ', time_model, 'seconds')
        print('time of error check of ts model: ', time_model_ts, 'seconds')
        print()

        # generate output shapes - dynamic tensor shape support
        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)

        # generate output types
        output_types = [x.dtype for x in outputs[0]]

        # now we build the config for TRTIS
        config_filename = os.path.join(model_folder, "config.pbtxt")
        self.lib.write_config(config_filename,
                              input_shapes, input_types,
                              output_shapes, output_types)