Commit fe436c6d authored by root

add model_3

parent 347dae81
onnx==1.17.0
onnxruntime==1.20.1
onnxsim==0.4.36
tf2onnx==1.16.1
numpy==1.24.3
\ No newline at end of file
#!/bin/bash
model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/mxr1/
export HIP_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
/opt/dtk/bin/migraphx-driver perf --migraphx ${model_path}/model-static-batch-size-${bs}.mxr > ${model_path}logs/DCU-migraphx-driver-${bs}.log
done
import migraphx
import json
import csv
import numpy as np
import time
from tqdm import tqdm
DTYPE={
"float32": np.float32,
"float64": np.float64,
"int32": np.int32,
"int64": np.int64,
"uint8": np.uint8,
"uint16": np.uint16,
"uint32": np.uint32,
"uint64": np.uint64,
"int8": np.int8,
"int16": np.int16,
}
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
names_dtype = {data[1]:data[-1] for data in datas}
return names_dtype
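# Assumed CSV layout (inferred from the indexing above, not from a documented schema):
# one header row, then rows such as
#   op_name,tensor_name,"(None, 128)",float32
# where column index 1 is the tensor name, index 2 a parenthesised shape string,
# and the last column the numpy dtype used to cast the JSON inputs.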
def load_datasets(datasets_path):
with open(datasets_path, 'r') as f:
datasets = json.load(f)
return datasets
def AllocateOutputMemory(model):
outputData={}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
if __name__ == "__main__":
input_names_dtype = read_csv_data("./new_models/model_1/input_tensors.csv")
compile_model = False
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
dataset_path = './new_models/model_1/dataset/input_tensor_datas_'+str(batch_size)+'.json'
model_path = './new_models/model_1/onnx/model-static-batch-size-'+str(batch_size)+'.onnx'
input_datasets = load_datasets(dataset_path)
input_datas = {key: np.array(value).astype(DTYPE[input_names_dtype[key]]) for key, value in input_datasets.items()}
if compile_model:
model = migraphx.parse_onnx(model_path)
print(f"compile {model_path}")
model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
print(f"./model_1/onnx/model-static-batch-size-{batch_size}.mxr")
migraphx.save(model, f"./model_1/mxr/model-static-batch-size-{batch_size}.mxr")
else:
print(f"./new_models/model_1/onnx/model-static-batch-size-{batch_size}.mxr")
model = migraphx.load( f"./new_models/model_1/mxr/model-static-batch-size-{batch_size}.mxr")
modelData = AllocateOutputMemory(model)
for key, _ in input_datas.items():
modelData[key] = migraphx.to_gpu(migraphx.argument(input_datas[key]))
for i in range(1100):
if i < 100:
times = time.time()
model.run(modelData)
print("*******batch_size: ", batch_size, "*******QPS: ", 1000/(time.time() - times)*batch_size)
import migraphx
import os
def main():
onnx_model_dir = "./models/model_1/onnx"
mxr_model_dir = "./models/model_1/mxr2"
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
model_path = os.path.join(onnx_model_dir, f"model-static-batch-size-{batch_size}.onnx")
model = migraphx.parse_onnx(model_path)
print(f"compile {model_path}")
model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
migraphx.save(model, os.path.join(mxr_model_dir, f"model-static-batch-size-{batch_size}.mxr"))
if __name__ == "__main__":
main()
\ No newline at end of file
#!/bin/bash
onnx_model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/onnx/
mxr_model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/mxr1/
export HIP_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
/opt/dtk/bin/migraphx-driver compile --binary --output ${mxr_model_path}/model-static-batch-size-${bs}.mxr --onnx ${onnx_model_path}/model-static-batch-size-${bs}.onnx
done
\ No newline at end of file
#!/bin/bash
onnx_model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/onnx/
trt_model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/trt/
export CUDA_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
trtexec --onnx=${onnx_model_path}model-static-batch-size-${bs}.onnx --saveEngine=${trt_model_path}model-static-batch-size-${bs}.trt
done
\ No newline at end of file
import tensorflow.compat.v1 as tf
import csv
from copy import deepcopy
import time
import json
import numpy as np
import string
import random
import logging
import os
from datetime import datetime
# Logging is configured via logging.basicConfig in main() below.
tf.disable_v2_behavior()
# Enable XLA auto-JIT. This must go through the environment; a bare Python
# assignment named TF_XLA_FLAGS would never reach TensorFlow.
os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=1"
DTYPE = {
'float32': tf.float32,
'int32': tf.int32,
'int64': tf.int64,
'string': tf.string
}
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def load_graph(model_file):
with tf.gfile.GFile(model_file, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name="")
return graph
def generate_graph_datas(graph, tensors, batch_size):
graph_datas = {}
graph_names = {}
for tensor in tensors:
graph_names[tensor[1].split(':')[0]] = graph.get_tensor_by_name(tensor[1])
_dtype = DTYPE[tensor[-1]]
shapes = deepcopy(tensor[2])
if shapes == ['']:
graph_datas[graph.get_tensor_by_name(tensor[1])] = tf.constant(3.14, dtype=_dtype)
continue
if shapes[0] == "None":
shapes[0] = batch_size
for i in range(len(shapes)):
if shapes[i] == "None":
shapes[i] = batch_size
if shapes[i] != "":
shapes[i] = int(shapes[i])
shapes = tuple(shapes)
if tensor[-1] == "int32":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "int64":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "string":
# Generate a string tensor
batch_size = shapes[0]
sequence_length = shapes[1] if len(shapes) > 1 else 1
random_tensor = tf.constant([["example_string"] * sequence_length] * batch_size, dtype=tf.string)
else:
random_tensor = tf.random.normal(shape=shapes, mean=0.0, stddev=1.0, dtype=_dtype)
graph_datas[graph.get_tensor_by_name(tensor[1])] = random_tensor
return graph_datas, graph_names
def load_datasets(datasets_path):
with open(datasets_path, 'r') as f:
datasets = json.load(f)
return datasets
def main():
model = "model_1"
model_dir = "./models"
random_flag = False
logs_dir = os.path.join(model_dir, f"{model}/logs")
create_directory(logs_dir)
logging.basicConfig(filename=os.path.join(logs_dir, f"TF-{model}-{datetime.now().strftime('%Y%m%d%H%M%S')}.log"),
filemode='a',
format='%(asctime)s - %(levelname)s - %(message)s',
level=logging.INFO)
input_tensors_path = os.path.join(model_dir, f'{model}/input_tensors.csv')
output_tensors_path = os.path.join(model_dir, f'{model}/output_tensors.csv')
model_path = os.path.join(model_dir, f'{model}/model.pb')
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
batch_size = [1,2,4,8,16,32,64,128,256,512,1024,2048]
with tf.device('/gpu:0'):
graph = load_graph(model_path)
with graph.as_default():
with tf.Session(graph=graph) as sess:
for bs in batch_size:
if random_flag:
print("random input data")
input_datasets, _ = generate_graph_datas(graph, input_tensors, bs)
_, output_names = generate_graph_datas(graph, output_tensors, bs)
input_values = sess.run(list(input_datasets.values()))
feed_dict = dict(zip(input_datasets.keys(), input_values))
else:
input_datasets_path = os.path.join(model_dir, f"{model}/dataset/input_tensor_datas_{bs}.json")
output_datasets_path = os.path.join(model_dir, f"{model}/dataset/output_tensor_datas_{bs}.json")
print(f"Load input data from json file: {input_datasets_path}")
input_datasets = load_datasets(input_datasets_path)
output_datasets = load_datasets(output_datasets_path)
input_tensors_dict = {graph.get_tensor_by_name(k): tf.convert_to_tensor(v, dtype=graph.get_tensor_by_name(k).dtype) for k, v in input_datasets.items()}
output_names = {k: graph.get_tensor_by_name(k) for k in output_datasets.keys()}
input_values = sess.run(list(input_tensors_dict.values()))
feed_dict = dict(zip(input_tensors_dict.keys(), input_values))
for i in range(130):
if i < 30:
times = time.time()
sess.run(output_names, feed_dict=feed_dict)
QPS = 100/(time.time() - times) * bs
logging.info(f"*******batch_size: {bs} *******QPS: {QPS}")
print(f"*******batch_size: {bs} *******QPS: {QPS}")
if __name__ == '__main__':
main()
\ No newline at end of file
#!/bin/bash
model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/trt/
export CUDA_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
trtexec --loadEngine=${model_path}model-static-batch-size-${bs}.trt --useSpinWait > ${model_path}/logs/nvidia-trtexec-${bs}.log
done
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import ctypes
from typing import Optional, List
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
try:
# Sometimes python does not understand FileNotFoundError
FileNotFoundError
except NameError:
FileNotFoundError = IOError
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def check_cuda_err(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("Cuda Error: {}".format(err))
if isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("Cuda Runtime Error: {}".format(err))
else:
raise RuntimeError("Unknown error type: {}".format(err))
def cuda_call(call):
err, res = call[0], call[1:]
check_cuda_err(err)
if len(res) == 1:
res = res[0]
return res
def GiB(val):
return val * 1 << 30
def add_help(description):
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
"""
Parses sample arguments.
Args:
description (str): Description of the sample.
subfolder (str): The subfolder containing data relevant to this sample
find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
Returns:
str: Path of data directory.
"""
# Standard command-line arguments for all samples.
kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-d",
"--datadir",
help="Location of the TensorRT sample data directory, and any additional data directories.",
action="append",
default=[kDEFAULT_DATA_ROOT],
)
args, _ = parser.parse_known_args()
def get_data_path(data_dir):
# If the subfolder exists, append it to the path, otherwise use the provided path as-is.
data_path = os.path.join(data_dir, subfolder)
if not os.path.exists(data_path):
if data_dir != kDEFAULT_DATA_ROOT:
print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
data_path = data_dir
# Make sure data directory exists.
if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
print(
"WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
data_path
)
)
return data_path
data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
return data_paths, locate_files(data_paths, find_files, err_msg)
def locate_files(data_paths, filenames, err_msg=""):
"""
Locates the specified files in the specified data directories.
If a file exists in multiple data directories, the first directory is used.
Args:
data_paths (List[str]): The data directories.
filename (List[str]): The names of the files to find.
Returns:
List[str]: The absolute paths of the files.
Raises:
FileNotFoundError if a file could not be located.
"""
found_files = [None] * len(filenames)
for data_path in data_paths:
# Find all requested files.
for index, (found, filename) in enumerate(zip(found_files, filenames)):
if not found:
file_path = os.path.abspath(os.path.join(data_path, filename))
if os.path.exists(file_path):
found_files[index] = file_path
# Check that all files were found
for f, filename in zip(found_files, filenames):
if not f or not os.path.exists(f):
raise FileNotFoundError(
"Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)
)
return found_files
class HostDeviceMem:
"""Pair of host and device memory, where the host memory is wrapped in a numpy array"""
def __init__(self, size: int, dtype: np.dtype):
nbytes = size * dtype.itemsize
host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
self._device = cuda_call(cudart.cudaMalloc(nbytes))
self._nbytes = nbytes
@property
def host(self) -> np.ndarray:
return self._host
@host.setter
def host(self, arr: np.ndarray):
if arr.size > self.host.size:
raise ValueError(
f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
)
np.copyto(self.host[:arr.size], arr.flat, casting='safe')
@property
def device(self) -> int:
return self._device
@property
def nbytes(self) -> int:
return self._nbytes
def __str__(self):
return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"
def __repr__(self):
return self.__str__()
def free(self):
cuda_call(cudart.cudaFree(self.device))
cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine, profile_idx: Optional[int] = None):
inputs = []
outputs = []
bindings = []
stream = cuda_call(cudart.cudaStreamCreate())
tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
for binding in tensor_names:
# get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
# Pick out the max shape to allocate enough memory for the binding.
shape = engine.get_tensor_shape(binding) if profile_idx is None else engine.get_tensor_profile_shape(binding, profile_idx)[-1]
shape_valid = np.all([s >= 0 for s in shape])
if not shape_valid and profile_idx is None:
raise ValueError(f"Binding {binding} has dynamic shape, " +\
"but no profile was specified.")
size = trt.volume(shape)
if engine.has_implicit_batch_dimension:
size *= engine.max_batch_size
dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
# Allocate host and device buffers
bindingMemory = HostDeviceMem(size, dtype)
# Append the device buffer to device bindings.
bindings.append(int(bindingMemory.device))
# Append to the appropriate list.
if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
inputs.append(bindingMemory)
else:
outputs.append(bindingMemory)
return inputs, outputs, bindings, stream
# Frees the resources allocated in allocate_buffers
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
for mem in inputs + outputs:
mem.free()
cuda_call(cudart.cudaStreamDestroy(stream))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
# print(f"size: {host_arr.size}, itemsize: {host_arr.itemsize}")
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
def _do_inference_base(inputs, outputs, stream, execute_async):
# Transfer input data to the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
[cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
# Run inference.
execute_async()
# Transfer predictions back from the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
[cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
# Synchronize the stream
cuda_call(cudart.cudaStreamSynchronize(stream))
# Return only the host outputs.
return [out.host for out in outputs]
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
def execute_async():
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream)
return _do_inference_base(inputs, outputs, stream, execute_async)
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
def execute_async():
context.execute_async_v2(bindings=bindings, stream_handle=stream)
return _do_inference_base(inputs, outputs, stream, execute_async)
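# Usage sketch for the helpers above (an illustration of the intended flow,
# not code taken from this repository; "model.trt" is a placeholder path):
#   runtime = trt.Runtime(trt.Logger(trt.Logger.ERROR))
#   with open("model.trt", "rb") as f:
#       engine = runtime.deserialize_cuda_engine(f.read())
#   context = engine.create_execution_context()
#   inputs, outputs, bindings, stream = allocate_buffers(engine)
#   inputs[0].host = my_input_array.ravel()            # hypothetical input
#   results = do_inference_v2(context, bindings=bindings, inputs=inputs,
#                             outputs=outputs, stream=stream)
#   free_buffers(inputs, outputs, stream)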
import onnx
from onnxsim import simplify
import os
def convert_onnx_dynamic_to_static(onnx_model_path, output_path, batch_size):
model = onnx.load(onnx_model_path)
for input in model.graph.input:
if input.type.tensor_type.HasField('shape'):
for dim in input.type.tensor_type.shape.dim:
if dim.dim_value == 0:  # dynamic dimensions typically show up as 0 here
dim.dim_value = batch_size
model_simp, check = simplify(model)
assert check, "Simplified ONNX model could not be validated"
# Save the simplified model
onnx.save(model_simp, output_path)
print(f"Simplified and static shape model saved to {output_path}")
if __name__ == '__main__':
model = "model_1"
model_dir = "./models"
onnx_model_path = os.path.join(model_dir, f'{model}/onnx-1/model.onnx')
batch_size = [1,2,4,8,16,32,64,128,256,512,1024,2048]
for bs in batch_size:
static_output_path = os.path.join(model_dir, f'{model}/onnx-1/model-static-batch-size-{bs}.onnx')
convert_onnx_dynamic_to_static(onnx_model_path, static_output_path, bs)
\ No newline at end of file
import tensorflow.compat.v1 as tf
import csv
import tf2onnx
import os
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def load_graph(model_file):
with tf.gfile.GFile(model_file, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name="")
return graph
def convert_graph_to_onnx(graph, input_tensors, output_tensors, output_path):
input_graph_names_list = []
output_graph_names_list = []
with graph.as_default():
for output_tensor in output_tensors:
output_graph_names_list.append(output_tensor[1])
for input_tensor in input_tensors:
input_graph_names_list.append(input_tensor[1])
with tf.Session(graph=graph) as sess:
onnx_graph = tf2onnx.tfonnx.process_tf_graph(sess.graph, input_names=input_graph_names_list, output_names=output_graph_names_list)
model_proto = onnx_graph.make_model("test_model")
with open(output_path, "wb") as f:
f.write(model_proto.SerializeToString())
print(f"ONNX model saved to {output_path}")
if __name__ == '__main__':
model = "model_1"
model_dir = "./models"
input_tensors_path = os.path.join(model_dir, f"{model}/input_tensors.csv")
output_tensors_path = os.path.join(model_dir, f"{model}/output_tensors.csv")
model_path = os.path.join(model_dir, f"{model}/model.pb")
onnx_model_dir = os.path.join(model_dir, f"{model}/onnx-1")
onnx_model_path = os.path.join(onnx_model_dir, "model.onnx")
create_directory(onnx_model_dir)
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
graph = load_graph(model_path)
convert_graph_to_onnx(graph, input_tensors, output_tensors, onnx_model_path)
\ No newline at end of file
import tensorflow.compat.v1 as tf
from copy import deepcopy
import csv
import json
import os
DTYPE = {
'float32': tf.float32,
'int32': tf.int32,
'int64': tf.int64,
'string': tf.string
}
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def convert_nested_array_dtype(arr):
converted_arr = []
for element in arr:
if isinstance(element, list):
# If the element is a list, recurse into it
converted_arr.append(convert_nested_array_dtype(element))
else:
# Otherwise decode the bytes element to a UTF-8 string
converted_arr.append(element.decode('utf-8'))
return converted_arr
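# Example of the conversion above: a nested list of byte strings such as
# [[b"example_string"], [b"example_string"]] becomes
# [["example_string"], ["example_string"]], so json.dump can serialise it.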
def generate_datas(tensors, batch_size):
graph_datas = {}
for tensor in tensors:
_dtype = DTYPE[tensor[-1]]
shapes = deepcopy(tensor[2])
if shapes == ['']:
graph_datas[(tensor[1])] = tf.constant(3.14, dtype=_dtype)
continue
for i in range(len(shapes)):
if shapes[i] == "None":
shapes[i] = batch_size
if shapes[i] != "":
shapes[i] = int(shapes[i])
shapes = tuple(shapes)
if tensor[-1] == "int32":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "int64":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "string":
# Generate a string tensor
batch_size = shapes[0]
sequence_length = shapes[1] if len(shapes) > 1 else 1
random_tensor = tf.constant([["example_string"] * sequence_length] * batch_size, dtype=tf.string)
else:
random_tensor = tf.random.normal(shape=shapes, mean=0.0, stddev=1.0, dtype=_dtype)
graph_datas[(tensor[1])] = random_tensor
return graph_datas
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path):
input_graph_datas = generate_datas(input_tensors, batch_size)
output_graph_datas = generate_datas(output_tensors, batch_size)
feed_dict = {}
output_dict = {}
# feed_dict = {key: value.numpy().tolist() for key, value in input_graph_datas.items()}
# output_dict = {key: value.numpy().tolist() for key, value in output_graph_datas.items()}
for key, value in input_graph_datas.items():
if value.dtype == tf.string:
value = value.numpy().tolist()
value = convert_nested_array_dtype(value)
else:
value = value.numpy().tolist()
feed_dict[key] = value
for key, value in output_graph_datas.items():
if value.dtype == tf.string:
value = value.numpy().tolist()
value = convert_nested_array_dtype(value)
else:
value = value.numpy().tolist()
output_dict[key] = value
with open(input_data_json_path, 'w') as f:
json.dump(feed_dict, f, indent=4)
with open(output_data_json_path, 'w') as f:
json.dump(output_dict, f, indent=4)
if __name__ == '__main__':
model = "model_1" # 测试模型 name
model_dir = "./models" # 模型目录
dataset_path = os.path.join(model_dir, f'{model}/dataset')
input_tensors_path = os.path.join(model_dir, f'{model}/input_tensors.csv')
output_tensors_path = os.path.join(model_dir, f'{model}/output_tensors.csv')
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
create_directory(dataset_path)
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
print("batch_size:", batch_size)
input_data_json_path = os.path.join(dataset_path, f'input_tensor_datas_{batch_size}.json')
output_data_json_path = os.path.join(dataset_path, f'output_tensor_datas_{batch_size}.json')
save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path)
\ No newline at end of file
import numpy as np
import tensorrt as trt
import torch
from cuda import cudart
import common as common
from colored import fg, stylize
import copy
import time
import json
# Random seed
def set_random_seed(num: int):
np.random.seed(num)
# torch.random.manual_seed(num)
def compare_value(pre_numpy: np.array, true_numpy: np.array):
assert pre_numpy.shape == true_numpy.shape
diff = np.abs(pre_numpy - true_numpy).max()
print(f"{pre_numpy[0, 0, 0, :3]} == {true_numpy[0, 0, 0, :3]}")
if diff > 1e-5:
print(stylize(f"diff: {diff} is_pass: failed", fg("red")))
else:
print(stylize(f"diff: {diff} is_pass: OK", fg("green")))
return diff
def load_tensor_from_npy_file(file_name, dir_path):
w_path = f"{dir_path}/{file_name}.npy"
data = np.load(w_path)
return torch.from_numpy(data)
def load_numpy_from_npy_file(file_name, dir_path):
w_path = f"{dir_path}/{file_name}.npy"
data = np.load(w_path)
return data
def load_numpy_from_tensor(tensor):
return copy.deepcopy(tensor.detach().cpu().numpy())
def get_tensor_from_numpy(data):
return torch.from_numpy(data)
def get_data_type(trt_data_type):
if trt.DataType.FLOAT == trt_data_type:
return torch.float32, 4
if trt.DataType.HALF == trt_data_type:
return torch.float16, 2
if trt.DataType.INT8 == trt_data_type:
return torch.int8, 1
if trt.DataType.INT32 == trt_data_type:
return torch.int32, 4
if trt.DataType.BOOL == trt_data_type:
return torch.bool, 1
if trt.DataType.UINT8 == trt_data_type:
return torch.uint8, 1
if trt.DataType.FP8 == trt_data_type:
return torch.float8, 1
else:
return "unknown", 0
class trtInfer:
def __init__(self, plan_path, batch_size=1):
self.init_plugin()
with open(plan_path, "rb") as f:
buffer = f.read()
self.engine = trt.Runtime(self.logger).deserialize_cuda_engine(buffer)
self.nIO = self.engine.num_io_tensors
self.ITensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)]
self.nInput = [self.engine.get_tensor_mode(self.ITensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT)
self.stream = cudart.cudaStreamCreate()[1]
self.context = self.engine.create_execution_context()
assert self.context
# print(f"self.ITensorName: {self.ITensorName}")
# print(f"self.nIO: {self.nIO}")
# print(f"self.nInput: {self.nInput}")
# Setup I/O bindings
self.inputs = []
self.outputs = []
self.allocations = []
self.IOBindings = []
for i in range(self.nIO):
name = self.ITensorName[i]
mode = self.engine.get_tensor_mode(name)
dtype = self.engine.get_tensor_dtype(name)
shape = self.engine.get_tensor_shape(name)
# print(f"name: {name}, shape: {shape}, dtype: {dtype}, mode: {mode}")
t_type, size = get_data_type(dtype)
for s in shape:
if s == -1:
s = 1
size *= s
# allocation = common.cuda_call(cudart.cudaMalloc(size * batch_size))
allocation = common.cuda_call(cudart.cudaMalloc(1024))
self.allocations.append(allocation)
binding = {
"index": i,
"name": name,
"dtype": t_type,
"shape": list(shape),
"allocation": allocation,
}
if trt.TensorIOMode.INPUT == mode:
self.batch_size = shape[0]
self.inputs.append(binding)
else:
self.outputs.append(binding)
device = torch.device("cuda:0")
self.output_buffer = []
for shape, dtype in self.output_spec():
self.output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
def init_plugin(self):
self.logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(self.logger, "")
def input_spec(self):
"""
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
specs = []
for o in self.inputs:
specs.append((o['shape'], o['dtype']))
return specs
def output_spec(self):
"""
Get the specs for the output tensors of the network. Useful to prepare memory allocations.
:return: A list with two items per element, the shape and (numpy) datatype of each output tensor.
"""
specs = []
for o in self.outputs:
specs.append((o['shape'], o['dtype']))
return specs
def set_Bindding(self):
self.IOBindings = []
self.IOBindings.extend(self.inputs)
self.IOBindings.extend(self.outputs)
for i, item in enumerate(self.IOBindings):
if i < self.nInput:
if not self.context.set_input_shape(item["name"], item["shape"]):
return False
if not self.context.set_tensor_address(item["name"], item["allocation"]):
return False
return True
def set_input(self, binding_buffering):
for i, item in enumerate(binding_buffering):
if torch.is_tensor(item):
self.inputs[i]['shape'] = list(item.shape)
self.inputs[i]['allocation'] = item.reshape(-1).data_ptr()
else:
self.inputs[i]['allocation'] = item
def set_output(self, binding_buffering):
for i, item in enumerate(binding_buffering):
self.outputs[i]['shape'] = list(item.shape)
self.outputs[i]['allocation'] = item.reshape(-1).data_ptr()
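# Note on the binding strategy in set_input/set_output above: instead of copying
# through staging buffers, the TensorRT tensor addresses are pointed directly at
# the device memory of the torch tensors (data_ptr()), so inference runs without
# extra host<->device copies once set_Bindding() has registered the addresses.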
def release(self):
cudart.cudaStreamDestroy(self.stream)
class DM_TRT(trtInfer):
def __init__(self, plan_path, bs=1):
super().__init__(plan_path, bs)
def __call__(self, x, timesteps, context, control, only_mid_control=False):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(timesteps)
input_buffer.append(context)
input_buffer.extend(control)
current_batch = x.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return output_buffer[0]
class CM_TRT(trtInfer):
def __init__(self, plan_path, bs=1):
super().__init__(plan_path, bs)
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(hint)
input_buffer.append(timesteps)
input_buffer.append(context)
# current_batch = x.shape[0]
# output_buffer = []
# for shape, dtype in self.output_spec():
# shape[0] = current_batch
# output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
# self.set_output(self.output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
# return output_buffer
# return self.output_buffer
return self.allocations[self.nInput:self.nIO]
class CM_DM_FUSE_TRT:
def __init__(self, control_path, unet_path):
self.control = CM_TRT(control_path)
self.unet = DM_TRT(unet_path)
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(hint)
input_buffer.append(timesteps)
input_buffer.append(context)
self.control.set_input(input_buffer) # set shape, allocate
# self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
input_unet_buffer = []
input_unet_buffer.append(self.control.inputs[0]["allocation"])
input_unet_buffer.append(self.control.inputs[2]["allocation"])
input_unet_buffer.append(self.control.inputs[3]["allocation"])
input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
current_batch = x.shape[0]
output_unet_buffer = []
for shape, dtype in self.unet.output_spec():
shape[0] = current_batch
output_unet_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.unet.set_input(input_unet_buffer) # set shape, allocate
self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
self.control.set_Bindding()
self.unet.set_Bindding()
self.control.context.execute_async_v3(self.control.stream)
self.unet.context.execute_async_v3(self.control.stream)
cudart.cudaStreamSynchronize(self.control.stream)
return output_unet_buffer[0]
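# How CM_DM_FUSE_TRT chains the two engines above: the ControlNet engine's output
# allocations are handed to the UNet engine as raw input addresses (no
# device-to-device copy), both contexts are enqueued on the ControlNet stream,
# and a single cudaStreamSynchronize waits for the fused pipeline.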
def memcpy_tensor_to_dev(data, address):
a_size = data[0].numel() * data[0].element_size()
for i, item in enumerate(data):
item_address = item.reshape(-1).data_ptr()
# batch x
common.cuda_call(cudart.cudaMemcpy(
address + i * a_size, item_address, a_size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice))
class CM_DM_BATCH_TRT:
def __init__(self, control_path, unet_path, batch_size):
self.control = CM_TRT(control_path, batch_size)
self.unet = DM_TRT(unet_path, batch_size)
# def __call__(self, x, hint, timesteps, context, **kwargs):
# device = x.device
# timesteps = timesteps.int()
# input_buffer = []
# # input_buffer.append(x)
# memcpy_tensor_to_dev([x,x], self.control.inputs[0]["allocation"])
# # input_buffer.append(hint)
# memcpy_tensor_to_dev(hint, self.control.inputs[1]["allocation"])
# # input_buffer.append(timesteps)
# memcpy_tensor_to_dev([timesteps, timesteps], self.control.inputs[2]["allocation"])
# # input_buffer.append(context)
# memcpy_tensor_to_dev(context, self.control.inputs[3]["allocation"])
# # self.control.set_input(input_buffer)  # use the internally pre-allocated device buffers
# # self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
# self.control.set_Bindding()
# input_unet_buffer = []
# input_unet_buffer.append(self.control.inputs[0]["allocation"])
# input_unet_buffer.append(self.control.inputs[2]["allocation"])
# input_unet_buffer.append(self.control.inputs[3]["allocation"])
# input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
# # current_batch = x.shape[0]
# current_batch = 2
# output_unet_buffer = []
# for shape, dtype in self.unet.output_spec():
# shape[0] = current_batch
# temp = torch.zeros(shape, dtype=dtype).float().to(device)
# output_unet_buffer.append(temp)
# self.unet.set_input(input_unet_buffer) # set shape, allocate
# self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
# self.unet.set_Bindding()
# self.control.context.execute_async_v3(self.control.stream)
# self.unet.context.execute_async_v3(self.control.stream)
# cudart.cudaStreamSynchronize(self.control.stream)
# model_t = output_unet_buffer[0][0]
# model_uncond = output_unet_buffer[0][1]
# model_output = model_uncond + 9 * (model_t - model_uncond)
# return model_output
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
# memcpy_tensor_to_dev([x,x], self.control.inputs[0]["allocation"])
input_buffer.append(hint)
# memcpy_tensor_to_dev(hint, self.control.inputs[1]["allocation"])
input_buffer.append(timesteps)
# memcpy_tensor_to_dev([timesteps, timesteps], self.control.inputs[2]["allocation"])
input_buffer.append(context)
# memcpy_tensor_to_dev(context, self.control.inputs[3]["allocation"])
self.control.set_input(input_buffer)  # use the internally pre-allocated device buffers
# self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
self.control.set_Bindding()
input_unet_buffer = []
input_unet_buffer.append(self.control.inputs[0]["allocation"])
input_unet_buffer.append(self.control.inputs[2]["allocation"])
input_unet_buffer.append(self.control.inputs[3]["allocation"])
input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
# current_batch = x.shape[0]
current_batch = 2
output_unet_buffer = []
for shape, dtype in self.unet.output_spec():
shape[0] = current_batch
temp = torch.zeros(shape, dtype=dtype).float().to(device)
output_unet_buffer.append(temp)
self.unet.set_input(input_unet_buffer) # set shape, allocate
self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
self.unet.set_Bindding()
self.control.context.execute_async_v3(self.control.stream)
self.unet.context.execute_async_v3(self.control.stream)
cudart.cudaStreamSynchronize(self.control.stream)
return output_unet_buffer[0]
class Decoder_TRT(trtInfer):
def __init__(self, plan_path):
super().__init__(plan_path)
def __call__(self, z):
device = z.device
input_buffer = []
input_buffer.append(z)
current_batch = z.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return output_buffer[0]
class ClipModelOutputs:
def __init__(self, last_hidden_state):
self.last_hidden_state = last_hidden_state
class CL_TRT(trtInfer):
def __init__(self, plan_path):
super().__init__(plan_path)
def __call__(self, input_ids, **kwargs):
device = input_ids.device
input_ids = input_ids.int()
input_buffer = []
input_buffer.append(input_ids)
# intput_id = x.cpu().numpy()
# common.memcpy_host_to_device(self.inputs[0]["allocation"], intput_id)
current_batch = input_ids.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
# text_embedding = np.zeros((1, 77, 768), dtype=np.float32)
# pooler_output = np.zeros((1, 768), dtype=np.float32)
# common.memcpy_device_to_host(text_embedding, self.outputs[0]["allocation"])
# common.memcpy_device_to_host(pooler_output, self.outputs[1]["allocation"])
# print(text_embedding)
# print(pooler_output)
return ClipModelOutputs(*output_buffer)
# return None
class EXP_TRT(trtInfer):
def __init__(self, plan_path, batch_size):
super().__init__(plan_path, batch_size)
def __call__(self, input_datas):
self.set_input(input_datas)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return 0
if __name__ == "__main__":
set_random_seed(2)
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
input_data_json_path = f'../new_models/model_1/dataset/input_tensor_datas_{batch_size}.json'
with open(input_data_json_path, 'r') as f:
input_datas = json.load(f)
input_datas = [value for value in input_datas.values()]
device = torch.device("cuda:0")
model_path = f"../new_models/model_1/trt/model-static-batch-size-{batch_size}.trt"
dm_trt = EXP_TRT(model_path, batch_size)
specs = dm_trt.input_spec()
specs = [spec[-1] for spec in specs]
input_datas = [torch.tensor(value, dtype=spec).to(device) for value, spec in zip(input_datas, specs) ]
times = time.time()
for i in range(1100):
if i < 100:
times = time.time()
dm_trt(input_datas)
print(f"*******batch_size: {batch_size} *******QPS: {1000 / (time.time() - times) * batch_size}")
time.sleep(10)