Commit fe436c6d authored by root

add model_3

parent 347dae81
onnx==1.17.0
onnxruntime==1.20.1
onnxsim==0.4.36
tf2onnx==1.16.1
numpy==1.24.3
\ No newline at end of file
#!/bin/bash
model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/mxr1/
export HIP_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
/opt/dtk/bin/migraphx-driver perf --migraphx ${model_path}/model-static-batch-size-${bs}.mxr > ${model_path}logs/DCU-migraphx-driver-${bs}.log
done
import migraphx
import json
import csv
import numpy as np
import time
from tqdm import tqdm
DTYPE={
"float32": np.float32,
"float64": np.float64,
"int32": np.int32,
"int64": np.int64,
"uint8": np.uint8,
"uint16": np.uint16,
"uint32": np.uint32,
"uint64": np.uint64,
"int8": np.int8,
"int16": np.int16,
}
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
names_dtype = {data[1]:data[-1] for data in datas}
return names_dtype
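# Assumed CSV layout (inferred from the indexing above, not from a documented schema):
# one header row, then rows such as
#   op_name,tensor_name,"(None, 128)",float32
# where column index 1 is the tensor name, index 2 a parenthesised shape string,
# and the last column the numpy dtype used to cast the JSON inputs.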
def load_datasets(datasets_path):
with open(datasets_path, 'r') as f:
datasets = json.load(f)
return datasets
def AllocateOutputMemory(model):
outputData={}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
if __name__ == "__main__":
input_names_dtype = read_csv_data("./new_models/model_1/input_tensors.csv")
compile_model = False
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
dataset_path = './new_models/model_1/dataset/input_tensor_datas_'+str(batch_size)+'.json'
model_path = './new_models/model_1/onnx/model-static-batch-size-'+str(batch_size)+'.onnx'
input_datasets = load_datasets(dataset_path)
input_datas = {key: np.array(value).astype(DTYPE[input_names_dtype[key]]) for key, value in input_datasets.items()}
if compile_model:
model = migraphx.parse_onnx(model_path)
print(f"compile {model_path}")
model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
print(f"./model_1/onnx/model-static-batch-size-{batch_size}.mxr")
migraphx.save(model, f"./model_1/mxr/model-static-batch-size-{batch_size}.mxr")
else:
print(f"./new_models/model_1/onnx/model-static-batch-size-{batch_size}.mxr")
model = migraphx.load( f"./new_models/model_1/mxr/model-static-batch-size-{batch_size}.mxr")
modelData = AllocateOutputMemory(model)
for key, _ in input_datas.items():
modelData[key] = migraphx.to_gpu(migraphx.argument(input_datas[key]))
for i in range(1100):
if i < 100:
times = time.time()
model.run(modelData)
print("*******batch_size: ", batch_size, "*******QPS: ", 1000/(time.time() - times)*batch_size)
import migraphx
import os
def main():
onnx_model_dir = "./models/model_1/onnx"
mxr_model_dir = "./models/model_1/mxr2"
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
model_path = os.path.join(onnx_model_dir, f"model-static-batch-size-{batch_size}.onnx")
model = migraphx.parse_onnx(model_path)
print(f"compile {model_path}")
model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
migraphx.save(model, os.path.join(mxr_model_dir, f"model-static-batch-size-{batch_size}.mxr"))
if __name__ == "__main__":
main()
\ No newline at end of file
#!/bin/bash
onnx_model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/onnx/
mxr_model_path=/home/workspace/ByteMLPerf/bytedance/models/model_1/mxr1/
export HIP_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
/opt/dtk/bin/migraphx-driver compile --binary --output ${mxr_model_path}/model-static-batch-size-${bs}.mxr --onnx ${onnx_model_path}/model-static-batch-size-${bs}.onnx
done
\ No newline at end of file
#!/bin/bash
onnx_model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/onnx/
trt_model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/trt/
export CUDA_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
trtexec --onnx=${onnx_model_path}model-static-batch-size-${bs}.onnx --saveEngine=${trt_model_path}model-static-batch-size-${bs}.trt
done
\ No newline at end of file
import tensorflow.compat.v1 as tf
import csv
from copy import deepcopy
import time
import json
import numpy as np
import string
import random
import logging
import os
from datetime import datetime
# Logging is configured via logging.basicConfig in main() below.
tf.disable_v2_behavior()
# Enable XLA auto-JIT. This must go through the environment; a bare Python
# assignment named TF_XLA_FLAGS would never reach TensorFlow.
os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=1"
DTYPE = {
'float32': tf.float32,
'int32': tf.int32,
'int64': tf.int64,
'string': tf.string
}
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def load_graph(model_file):
with tf.gfile.GFile(model_file, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name="")
return graph
def generate_graph_datas(graph, tensors, batch_size):
graph_datas = {}
graph_names = {}
for tensor in tensors:
graph_names[tensor[1].split(':')[0]] = graph.get_tensor_by_name(tensor[1])
_dtype = DTYPE[tensor[-1]]
shapes = deepcopy(tensor[2])
if shapes == ['']:
graph_datas[graph.get_tensor_by_name(tensor[1])] = tf.constant(3.14, dtype=_dtype)
continue
if shapes[0] == "None":
shapes[0] = batch_size
for i in range(len(shapes)):
if shapes[i] == "None":
shapes[i] = batch_size
if shapes[i] != "":
shapes[i] = int(shapes[i])
shapes = tuple(shapes)
if tensor[-1] == "int32":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "int64":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "string":
# Generate a string tensor
batch_size = shapes[0]
sequence_length = shapes[1] if len(shapes) > 1 else 1
random_tensor = tf.constant([["example_string"] * sequence_length] * batch_size, dtype=tf.string)
else:
random_tensor = tf.random.normal(shape=shapes, mean=0.0, stddev=1.0, dtype=_dtype)
graph_datas[graph.get_tensor_by_name(tensor[1])] = random_tensor
return graph_datas, graph_names
def load_datasets(datasets_path):
with open(datasets_path, 'r') as f:
datasets = json.load(f)
return datasets
def main():
model = "model_1"
model_dir = "./models"
random_flag = False
logs_dir = os.path.join(model_dir, f"{model}/logs")
create_directory(logs_dir)
logging.basicConfig(filename=os.path.join(logs_dir, f"TF-{model}-{datetime.now().strftime('%Y%m%d%H%M%S')}.log"),
filemode='a',
format='%(asctime)s - %(levelname)s - %(message)s',
level=logging.INFO)
input_tensors_path = os.path.join(model_dir, f'{model}/input_tensors.csv')
output_tensors_path = os.path.join(model_dir, f'{model}/output_tensors.csv')
model_path = os.path.join(model_dir, f'{model}/model.pb')
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
batch_size = [1,2,4,8,16,32,64,128,256,512,1024,2048]
with tf.device('/gpu:0'):
graph = load_graph(model_path)
with graph.as_default():
with tf.Session(graph=graph) as sess:
for bs in batch_size:
if random_flag:
print("random input data")
input_datasets, _ = generate_graph_datas(graph, input_tensors, bs)
_, output_names = generate_graph_datas(graph, output_tensors, bs)
input_values = sess.run(list(input_datasets.values()))
feed_dict = dict(zip(input_datasets.keys(), input_values))
else:
input_datasets_path = os.path.join(model_dir, f"{model}/dataset/input_tensor_datas_{bs}.json")
output_datasets_path = os.path.join(model_dir, f"{model}/dataset/output_tensor_datas_{bs}.json")
print(f"Load input data from json file: {input_datasets_path}")
input_datasets = load_datasets(input_datasets_path)
output_datasets = load_datasets(output_datasets_path)
input_tensors_dict = {graph.get_tensor_by_name(k): tf.convert_to_tensor(v, dtype=graph.get_tensor_by_name(k).dtype) for k, v in input_datasets.items()}
output_names = {k: graph.get_tensor_by_name(k) for k in output_datasets.keys()}
input_values = sess.run(list(input_tensors_dict.values()))
feed_dict = dict(zip(input_tensors_dict.keys(), input_values))
for i in range(130):
if i < 30:
times = time.time()
sess.run(output_names, feed_dict=feed_dict)
QPS = 100/(time.time() - times) * bs
logging.info(f"*******batch_size: {bs} *******QPS: {QPS}")
print(f"*******batch_size: {bs} *******QPS: {QPS}")
if __name__ == '__main__':
main()
\ No newline at end of file
#!/bin/bash
model_path=/datav/sunzhq/workspace/bytedance/new_models/model_1/trt/
export CUDA_VISIBLE_DEVICES=0
for bs in 1 2 4 8 16 32 64 128 256 512 1024 2048; do
echo "====== batch size: ${bs} ======"
trtexec --loadEngine=${model_path}model-static-batch-size-${bs}.trt --useSpinWait > ${model_path}/logs/nvidia-trtexec-${bs}.log
done
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import ctypes
from typing import Optional, List
import numpy as np
import tensorrt as trt
from cuda import cuda, cudart
try:
# Sometimes python does not understand FileNotFoundError
FileNotFoundError
except NameError:
FileNotFoundError = IOError
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def check_cuda_err(err):
if isinstance(err, cuda.CUresult):
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("Cuda Error: {}".format(err))
if isinstance(err, cudart.cudaError_t):
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("Cuda Runtime Error: {}".format(err))
else:
raise RuntimeError("Unknown error type: {}".format(err))
def cuda_call(call):
err, res = call[0], call[1:]
check_cuda_err(err)
if len(res) == 1:
res = res[0]
return res
def GiB(val):
return val * 1 << 30
def add_help(description):
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
args, _ = parser.parse_known_args()
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
"""
Parses sample arguments.
Args:
description (str): Description of the sample.
subfolder (str): The subfolder containing data relevant to this sample
find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
Returns:
str: Path of data directory.
"""
# Standard command-line arguments for all samples.
kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-d",
"--datadir",
help="Location of the TensorRT sample data directory, and any additional data directories.",
action="append",
default=[kDEFAULT_DATA_ROOT],
)
args, _ = parser.parse_known_args()
def get_data_path(data_dir):
# If the subfolder exists, append it to the path, otherwise use the provided path as-is.
data_path = os.path.join(data_dir, subfolder)
if not os.path.exists(data_path):
if data_dir != kDEFAULT_DATA_ROOT:
print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
data_path = data_dir
# Make sure data directory exists.
if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
print(
"WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(
data_path
)
)
return data_path
data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
return data_paths, locate_files(data_paths, find_files, err_msg)
def locate_files(data_paths, filenames, err_msg=""):
"""
Locates the specified files in the specified data directories.
If a file exists in multiple data directories, the first directory is used.
Args:
data_paths (List[str]): The data directories.
filename (List[str]): The names of the files to find.
Returns:
List[str]: The absolute paths of the files.
Raises:
FileNotFoundError if a file could not be located.
"""
found_files = [None] * len(filenames)
for data_path in data_paths:
# Find all requested files.
for index, (found, filename) in enumerate(zip(found_files, filenames)):
if not found:
file_path = os.path.abspath(os.path.join(data_path, filename))
if os.path.exists(file_path):
found_files[index] = file_path
# Check that all files were found
for f, filename in zip(found_files, filenames):
if not f or not os.path.exists(f):
raise FileNotFoundError(
"Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)
)
return found_files
class HostDeviceMem:
"""Pair of host and device memory, where the host memory is wrapped in a numpy array"""
def __init__(self, size: int, dtype: np.dtype):
nbytes = size * dtype.itemsize
host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
self._device = cuda_call(cudart.cudaMalloc(nbytes))
self._nbytes = nbytes
@property
def host(self) -> np.ndarray:
return self._host
@host.setter
def host(self, arr: np.ndarray):
if arr.size > self.host.size:
raise ValueError(
f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
)
np.copyto(self.host[:arr.size], arr.flat, casting='safe')
@property
def device(self) -> int:
return self._device
@property
def nbytes(self) -> int:
return self._nbytes
def __str__(self):
return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"
def __repr__(self):
return self.__str__()
def free(self):
cuda_call(cudart.cudaFree(self.device))
cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
# If engine uses dynamic shapes, specify a profile to find the maximum input & output size.
def allocate_buffers(engine: trt.ICudaEngine, profile_idx: Optional[int] = None):
inputs = []
outputs = []
bindings = []
stream = cuda_call(cudart.cudaStreamCreate())
tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
for binding in tensor_names:
# get_tensor_profile_shape returns (min_shape, optimal_shape, max_shape)
# Pick out the max shape to allocate enough memory for the binding.
shape = engine.get_tensor_shape(binding) if profile_idx is None else engine.get_tensor_profile_shape(binding, profile_idx)[-1]
shape_valid = np.all([s >= 0 for s in shape])
if not shape_valid and profile_idx is None:
raise ValueError(f"Binding {binding} has dynamic shape, " +\
"but no profile was specified.")
size = trt.volume(shape)
if engine.has_implicit_batch_dimension:
size *= engine.max_batch_size
dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(binding)))
# Allocate host and device buffers
bindingMemory = HostDeviceMem(size, dtype)
# Append the device buffer to device bindings.
bindings.append(int(bindingMemory.device))
# Append to the appropriate list.
if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
inputs.append(bindingMemory)
else:
outputs.append(bindingMemory)
return inputs, outputs, bindings, stream
# Frees the resources allocated in allocate_buffers
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
for mem in inputs + outputs:
mem.free()
cuda_call(cudart.cudaStreamDestroy(stream))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
# print(f"size: {host_arr.size}, itemsize: {host_arr.itemsize}")
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice))
# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
nbytes = host_arr.size * host_arr.itemsize
cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
def _do_inference_base(inputs, outputs, stream, execute_async):
# Transfer input data to the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
[cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
# Run inference.
execute_async()
# Transfer predictions back from the GPU.
kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
[cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
# Synchronize the stream
cuda_call(cudart.cudaStreamSynchronize(stream))
# Return only the host outputs.
return [out.host for out in outputs]
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
def execute_async():
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream)
return _do_inference_base(inputs, outputs, stream, execute_async)
# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
def execute_async():
context.execute_async_v2(bindings=bindings, stream_handle=stream)
return _do_inference_base(inputs, outputs, stream, execute_async)
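# Usage sketch for the helpers above (an illustration of the intended flow,
# not code taken from this repository; "model.trt" is a placeholder path):
#   runtime = trt.Runtime(trt.Logger(trt.Logger.ERROR))
#   with open("model.trt", "rb") as f:
#       engine = runtime.deserialize_cuda_engine(f.read())
#   context = engine.create_execution_context()
#   inputs, outputs, bindings, stream = allocate_buffers(engine)
#   inputs[0].host = my_input_array.ravel()            # hypothetical input
#   results = do_inference_v2(context, bindings=bindings, inputs=inputs,
#                             outputs=outputs, stream=stream)
#   free_buffers(inputs, outputs, stream)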
import onnx
from onnxsim import simplify
import os
def convert_onnx_dynamic_to_static(onnx_model_path, output_path, batch_size):
model = onnx.load(onnx_model_path)
for input in model.graph.input:
if input.type.tensor_type.HasField('shape'):
for dim in input.type.tensor_type.shape.dim:
if dim.dim_value == 0:  # dynamic dimensions typically show up as 0 here
dim.dim_value = batch_size
model_simp, check = simplify(model)
assert check, "Simplified ONNX model could not be validated"
# Save the simplified model
onnx.save(model_simp, output_path)
print(f"Simplified and static shape model saved to {output_path}")
if __name__ == '__main__':
model = "model_1"
model_dir = "./models"
onnx_model_path = os.path.join(model_dir, f'{model}/onnx-1/model.onnx')
batch_size = [1,2,4,8,16,32,64,128,256,512,1024,2048]
for bs in batch_size:
static_output_path = os.path.join(model_dir, f'{model}/onnx-1/model-static-batch-size-{bs}.onnx')
convert_onnx_dynamic_to_static(onnx_model_path, static_output_path, bs)
\ No newline at end of file
import tensorflow.compat.v1 as tf
import csv
import tf2onnx
import os
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def load_graph(model_file):
with tf.gfile.GFile(model_file, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name="")
return graph
def convert_graph_to_onnx(graph, input_tensors, output_tensors, output_path):
input_graph_names_list = []
output_graph_names_list = []
with graph.as_default():
for output_tensor in output_tensors:
output_graph_names_list.append(output_tensor[1])
for input_tensor in input_tensors:
input_graph_names_list.append(input_tensor[1])
with tf.Session(graph=graph) as sess:
onnx_graph = tf2onnx.tfonnx.process_tf_graph(sess.graph, input_names=input_graph_names_list, output_names=output_graph_names_list)
model_proto = onnx_graph.make_model("test_model")
with open(output_path, "wb") as f:
f.write(model_proto.SerializeToString())
print(f"ONNX model saved to {output_path}")
if __name__ == '__main__':
model = "model_1"
model_dir = "./models"
input_tensors_path = os.path.join(model_dir, f"{model}/input_tensors.csv")
output_tensors_path = os.path.join(model_dir, f"{model}/output_tensors.csv")
model_path = os.path.join(model_dir, f"{model}/model.pb")
onnx_model_dir = os.path.join(model_dir, f"{model}/onnx-1")
onnx_model_path = os.path.join(onnx_model_dir, "model.onnx")
create_directory(onnx_model_dir)
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
graph = load_graph(model_path)
convert_graph_to_onnx(graph, input_tensors, output_tensors, onnx_model_path)
\ No newline at end of file
import tensorflow.compat.v1 as tf
from copy import deepcopy
import csv
import json
import os
DTYPE = {
'float32': tf.float32,
'int32': tf.int32,
'int64': tf.int64,
'string': tf.string
}
def create_directory(path):
if not os.path.exists(path):
os.makedirs(path)
print(f"Directory '{path}' created.")
else:
print(f"Directory '{path}' already exists.")
def convert_nested_array_dtype(arr):
converted_arr = []
for element in arr:
if isinstance(element, list):
# If the element is a list, recurse into it
converted_arr.append(convert_nested_array_dtype(element))
else:
# Otherwise decode the bytes element to a UTF-8 string
converted_arr.append(element.decode('utf-8'))
return converted_arr
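# Example of the conversion above: a nested list of byte strings such as
# [[b"example_string"], [b"example_string"]] becomes
# [["example_string"], ["example_string"]], so json.dump can serialise it.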
def generate_datas(tensors, batch_size):
graph_datas = {}
for tensor in tensors:
_dtype = DTYPE[tensor[-1]]
shapes = deepcopy(tensor[2])
if shapes == ['']:
graph_datas[(tensor[1])] = tf.constant(3.14, dtype=_dtype)
continue
for i in range(len(shapes)):
if shapes[i] == "None":
shapes[i] = batch_size
if shapes[i] != "":
shapes[i] = int(shapes[i])
shapes = tuple(shapes)
if tensor[-1] == "int32":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "int64":
random_tensor = tf.random.uniform(shape=shapes, minval=0, maxval=10, dtype=_dtype)
elif tensor[-1] == "string":
# Generate a string tensor
batch_size = shapes[0]
sequence_length = shapes[1] if len(shapes) > 1 else 1
random_tensor = tf.constant([["example_string"] * sequence_length] * batch_size, dtype=tf.string)
else:
random_tensor = tf.random.normal(shape=shapes, mean=0.0, stddev=1.0, dtype=_dtype)
graph_datas[(tensor[1])] = random_tensor
return graph_datas
def read_csv_data(file_path):
with open(file_path, 'r') as f:
reader = csv.reader(f)
next(reader)
datas = list(reader)
for data in datas:
data[2] = data[2][1:-1].split(",")
return datas
def save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path):
input_graph_datas = generate_datas(input_tensors, batch_size)
output_graph_datas = generate_datas(output_tensors, batch_size)
feed_dict = {}
output_dict = {}
# feed_dict = {key: value.numpy().tolist() for key, value in input_graph_datas.items()}
# output_dict = {key: value.numpy().tolist() for key, value in output_graph_datas.items()}
for key, value in input_graph_datas.items():
if value.dtype == tf.string:
value = value.numpy().tolist()
value = convert_nested_array_dtype(value)
else:
value = value.numpy().tolist()
feed_dict[key] = value
for key, value in output_graph_datas.items():
if value.dtype == tf.string:
value = value.numpy().tolist()
value = convert_nested_array_dtype(value)
else:
value = value.numpy().tolist()
output_dict[key] = value
with open(input_data_json_path, 'w') as f:
json.dump(feed_dict, f, indent=4)
with open(output_data_json_path, 'w') as f:
json.dump(output_dict, f, indent=4)
if __name__ == '__main__':
model = "model_1" # 测试模型 name
model_dir = "./models" # 模型目录
dataset_path = os.path.join(model_dir, f'{model}/dataset')
input_tensors_path = os.path.join(model_dir, f'{model}/input_tensors.csv')
output_tensors_path = os.path.join(model_dir, f'{model}/output_tensors.csv')
input_tensors = read_csv_data(input_tensors_path)
output_tensors = read_csv_data(output_tensors_path)
create_directory(dataset_path)
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
print("batch_size:", batch_size)
input_data_json_path = os.path.join(dataset_path, f'input_tensor_datas_{batch_size}.json')
output_data_json_path = os.path.join(dataset_path, f'output_tensor_datas_{batch_size}.json')
save_graph_datasets_json(input_tensors, output_tensors, batch_size, input_data_json_path, output_data_json_path)
\ No newline at end of file
import numpy as np
import tensorrt as trt
import torch
from cuda import cudart
import common as common
from colored import fg, stylize
import copy
import time
import json
# Random seed
def set_random_seed(num: int):
np.random.seed(num)
# torch.random.manual_seed(num)
def compare_value(pre_numpy: np.array, true_numpy: np.array):
assert pre_numpy.shape == true_numpy.shape
diff = np.abs(pre_numpy - true_numpy).max()
print(f"{pre_numpy[0, 0, 0, :3]} == {true_numpy[0, 0, 0, :3]}")
if diff > 1e-5:
print(stylize(f"diff: {diff} is_pass: failed", fg("red")))
else:
print(stylize(f"diff: {diff} is_pass: OK", fg("green")))
return diff
def load_tensor_from_npy_file(file_name, dir_path):
w_path = f"{dir_path}/{file_name}.npy"
data = np.load(w_path)
return torch.from_numpy(data)
def load_numpy_from_npy_file(file_name, dir_path):
w_path = f"{dir_path}/{file_name}.npy"
data = np.load(w_path)
return data
def load_numpy_from_tensor(tensor):
return copy.deepcopy(tensor.detach().cpu().numpy())
def get_tensor_from_numpy(data):
return torch.from_numpy(data)
def get_data_type(trt_data_type):
if trt.DataType.FLOAT == trt_data_type:
return torch.float32, 4
if trt.DataType.HALF == trt_data_type:
return torch.float16, 2
if trt.DataType.INT8 == trt_data_type:
return torch.int8, 1
if trt.DataType.INT32 == trt_data_type:
return torch.int32, 4
if trt.DataType.BOOL == trt_data_type:
return torch.bool, 1
if trt.DataType.UINT8 == trt_data_type:
return torch.uint8, 1
if trt.DataType.FP8 == trt_data_type:
return torch.float8, 1
else:
return "unknown", 0
class trtInfer:
def __init__(self, plan_path, batch_size=1):
self.init_plugin()
with open(plan_path, "rb") as f:
buffer = f.read()
self.engine = trt.Runtime(self.logger).deserialize_cuda_engine(buffer)
self.nIO = self.engine.num_io_tensors
self.ITensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)]
self.nInput = [self.engine.get_tensor_mode(self.ITensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT)
self.stream = cudart.cudaStreamCreate()[1]
self.context = self.engine.create_execution_context()
assert self.context
# print(f"self.ITensorName: {self.ITensorName}")
# print(f"self.nIO: {self.nIO}")
# print(f"self.nInput: {self.nInput}")
# Setup I/O bindings
self.inputs = []
self.outputs = []
self.allocations = []
self.IOBindings = []
for i in range(self.nIO):
name = self.ITensorName[i]
mode = self.engine.get_tensor_mode(name)
dtype = self.engine.get_tensor_dtype(name)
shape = self.engine.get_tensor_shape(name)
# print(f"name: {name}, shape: {shape}, dtype: {dtype}, mode: {mode}")
t_type, size = get_data_type(dtype)
for s in shape:
if s == -1:
s = 1
size *= s
# allocation = common.cuda_call(cudart.cudaMalloc(size * batch_size))
allocation = common.cuda_call(cudart.cudaMalloc(1024))
self.allocations.append(allocation)
binding = {
"index": i,
"name": name,
"dtype": t_type,
"shape": list(shape),
"allocation": allocation,
}
if trt.TensorIOMode.INPUT == mode:
self.batch_size = shape[0]
self.inputs.append(binding)
else:
self.outputs.append(binding)
device = torch.device("cuda:0")
self.output_buffer = []
for shape, dtype in self.output_spec():
self.output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
def init_plugin(self):
self.logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(self.logger, "")
def input_spec(self):
"""
Get the specs for the input tensor of the network. Useful to prepare memory allocations.
:return: Two items, the shape of the input tensor and its (numpy) datatype.
"""
specs = []
for o in self.inputs:
specs.append((o['shape'], o['dtype']))
return specs
def output_spec(self):
"""
Get the specs for the output tensors of the network. Useful to prepare memory allocations.
:return: A list with two items per element, the shape and (numpy) datatype of each output tensor.
"""
specs = []
for o in self.outputs:
specs.append((o['shape'], o['dtype']))
return specs
def set_Bindding(self):
self.IOBindings = []
self.IOBindings.extend(self.inputs)
self.IOBindings.extend(self.outputs)
for i, item in enumerate(self.IOBindings):
if i < self.nInput:
if not self.context.set_input_shape(item["name"], item["shape"]):
return False
if not self.context.set_tensor_address(item["name"], item["allocation"]):
return False
return True
def set_input(self, binding_buffering):
for i, item in enumerate(binding_buffering):
if torch.is_tensor(item):
self.inputs[i]['shape'] = list(item.shape)
self.inputs[i]['allocation'] = item.reshape(-1).data_ptr()
else:
self.inputs[i]['allocation'] = item
def set_output(self, binding_buffering):
for i, item in enumerate(binding_buffering):
self.outputs[i]['shape'] = list(item.shape)
self.outputs[i]['allocation'] = item.reshape(-1).data_ptr()
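# Note on the binding strategy in set_input/set_output above: instead of copying
# through staging buffers, the TensorRT tensor addresses are pointed directly at
# the device memory of the torch tensors (data_ptr()), so inference runs without
# extra host<->device copies once set_Bindding() has registered the addresses.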
def release(self):
cudart.cudaStreamDestroy(self.stream)
class DM_TRT(trtInfer):
def __init__(self, plan_path, bs=1):
super().__init__(plan_path, bs)
def __call__(self, x, timesteps, context, control, only_mid_control=False):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(timesteps)
input_buffer.append(context)
input_buffer.extend(control)
current_batch = x.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return output_buffer[0]
class CM_TRT(trtInfer):
def __init__(self, plan_path, bs=1):
super().__init__(plan_path, bs)
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(hint)
input_buffer.append(timesteps)
input_buffer.append(context)
# current_batch = x.shape[0]
# output_buffer = []
# for shape, dtype in self.output_spec():
# shape[0] = current_batch
# output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
# self.set_output(self.output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
# return output_buffer
# return self.output_buffer
return self.allocations[self.nInput:self.nIO]
class CM_DM_FUSE_TRT:
def __init__(self, control_path, unet_path):
self.control = CM_TRT(control_path)
self.unet = DM_TRT(unet_path)
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
input_buffer.append(hint)
input_buffer.append(timesteps)
input_buffer.append(context)
self.control.set_input(input_buffer) # set shape, allocate
# self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
input_unet_buffer = []
input_unet_buffer.append(self.control.inputs[0]["allocation"])
input_unet_buffer.append(self.control.inputs[2]["allocation"])
input_unet_buffer.append(self.control.inputs[3]["allocation"])
input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
current_batch = x.shape[0]
output_unet_buffer = []
for shape, dtype in self.unet.output_spec():
shape[0] = current_batch
output_unet_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.unet.set_input(input_unet_buffer) # set shape, allocate
self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
self.control.set_Bindding()
self.unet.set_Bindding()
self.control.context.execute_async_v3(self.control.stream)
self.unet.context.execute_async_v3(self.control.stream)
cudart.cudaStreamSynchronize(self.control.stream)
return output_unet_buffer[0]
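# How CM_DM_FUSE_TRT chains the two engines above: the ControlNet engine's output
# allocations are handed to the UNet engine as raw input addresses (no
# device-to-device copy), both contexts are enqueued on the ControlNet stream,
# and a single cudaStreamSynchronize waits for the fused pipeline.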
def memcpy_tensor_to_dev(data, address):
a_size = data[0].numel() * data[0].element_size()
for i, item in enumerate(data):
item_address = item.reshape(-1).data_ptr()
# batch x
common.cuda_call(cudart.cudaMemcpy(
address + i * a_size, item_address, a_size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice))
class CM_DM_BATCH_TRT:
def __init__(self, control_path, unet_path, batch_size):
self.control = CM_TRT(control_path, batch_size)
self.unet = DM_TRT(unet_path, batch_size)
# def __call__(self, x, hint, timesteps, context, **kwargs):
# device = x.device
# timesteps = timesteps.int()
# input_buffer = []
# # input_buffer.append(x)
# memcpy_tensor_to_dev([x,x], self.control.inputs[0]["allocation"])
# # input_buffer.append(hint)
# memcpy_tensor_to_dev(hint, self.control.inputs[1]["allocation"])
# # input_buffer.append(timesteps)
# memcpy_tensor_to_dev([timesteps, timesteps], self.control.inputs[2]["allocation"])
# # input_buffer.append(context)
# memcpy_tensor_to_dev(context, self.control.inputs[3]["allocation"])
# # self.control.set_input(input_buffer)  # use the internally pre-allocated device buffers
# # self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
# self.control.set_Bindding()
# input_unet_buffer = []
# input_unet_buffer.append(self.control.inputs[0]["allocation"])
# input_unet_buffer.append(self.control.inputs[2]["allocation"])
# input_unet_buffer.append(self.control.inputs[3]["allocation"])
# input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
# # current_batch = x.shape[0]
# current_batch = 2
# output_unet_buffer = []
# for shape, dtype in self.unet.output_spec():
# shape[0] = current_batch
# temp = torch.zeros(shape, dtype=dtype).float().to(device)
# output_unet_buffer.append(temp)
# self.unet.set_input(input_unet_buffer) # set shape, allocate
# self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
# self.unet.set_Bindding()
# self.control.context.execute_async_v3(self.control.stream)
# self.unet.context.execute_async_v3(self.control.stream)
# cudart.cudaStreamSynchronize(self.control.stream)
# model_t = output_unet_buffer[0][0]
# model_uncond = output_unet_buffer[0][1]
# model_output = model_uncond + 9 * (model_t - model_uncond)
# return model_output
def __call__(self, x, hint, timesteps, context, **kwargs):
device = x.device
timesteps = timesteps.int()
input_buffer = []
input_buffer.append(x)
# memcpy_tensor_to_dev([x,x], self.control.inputs[0]["allocation"])
input_buffer.append(hint)
# memcpy_tensor_to_dev(hint, self.control.inputs[1]["allocation"])
input_buffer.append(timesteps)
# memcpy_tensor_to_dev([timesteps, timesteps], self.control.inputs[2]["allocation"])
input_buffer.append(context)
# memcpy_tensor_to_dev(context, self.control.inputs[3]["allocation"])
self.control.set_input(input_buffer)  # use the internally pre-allocated device buffers
# self.control.set_output(self.output_buffer)  # use the internally pre-allocated device buffers
self.control.set_Bindding()
input_unet_buffer = []
input_unet_buffer.append(self.control.inputs[0]["allocation"])
input_unet_buffer.append(self.control.inputs[2]["allocation"])
input_unet_buffer.append(self.control.inputs[3]["allocation"])
input_unet_buffer.extend(self.control.allocations[self.control.nInput:self.control.nIO])
# current_batch = x.shape[0]
current_batch = 2
output_unet_buffer = []
for shape, dtype in self.unet.output_spec():
shape[0] = current_batch
temp = torch.zeros(shape, dtype=dtype).float().to(device)
output_unet_buffer.append(temp)
self.unet.set_input(input_unet_buffer) # set shape, allocate
self.unet.set_output(output_unet_buffer)  # use the internally pre-allocated device buffers
self.unet.set_Bindding()
self.control.context.execute_async_v3(self.control.stream)
self.unet.context.execute_async_v3(self.control.stream)
cudart.cudaStreamSynchronize(self.control.stream)
return output_unet_buffer[0]
class Decoder_TRT(trtInfer):
def __init__(self, plan_path):
super().__init__(plan_path)
def __call__(self, z):
device = z.device
input_buffer = []
input_buffer.append(z)
current_batch = z.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return output_buffer[0]
class ClipModelOutputs:
def __init__(self, last_hidden_state):
self.last_hidden_state = last_hidden_state
class CL_TRT(trtInfer):
def __init__(self, plan_path):
super().__init__(plan_path)
def __call__(self, input_ids, **kwargs):
device = input_ids.device
input_ids = input_ids.int()
input_buffer = []
input_buffer.append(input_ids)
# intput_id = x.cpu().numpy()
# common.memcpy_host_to_device(self.inputs[0]["allocation"], intput_id)
current_batch = input_ids.shape[0]
output_buffer = []
for shape, dtype in self.output_spec():
shape[0] = current_batch
output_buffer.append(torch.zeros(shape, dtype=dtype).float().to(device))
self.set_input(input_buffer) # set shape, allocate
self.set_output(output_buffer)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
# text_embedding = np.zeros((1, 77, 768), dtype=np.float32)
# pooler_output = np.zeros((1, 768), dtype=np.float32)
# common.memcpy_device_to_host(text_embedding, self.outputs[0]["allocation"])
# common.memcpy_device_to_host(pooler_output, self.outputs[1]["allocation"])
# print(text_embedding)
# print(pooler_output)
return ClipModelOutputs(*output_buffer)
# return None
class EXP_TRT(trtInfer):
def __init__(self, plan_path, batch_size):
super().__init__(plan_path, batch_size)
def __call__(self, input_datas):
self.set_input(input_datas)
self.set_Bindding()
self.context.execute_async_v3(self.stream)
cudart.cudaStreamSynchronize(self.stream)
return 0
if __name__ == "__main__":
set_random_seed(2)
for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
input_data_json_path = f'../new_models/model_1/dataset/input_tensor_datas_{batch_size}.json'
with open(input_data_json_path, 'r') as f:
input_datas = json.load(f)
input_datas = [value for value in input_datas.values()]
device = torch.device("cuda:0")
model_path = f"../new_models/model_1/trt/model-static-batch-size-{batch_size}.trt"
dm_trt = EXP_TRT(model_path, batch_size)
specs = dm_trt.input_spec()
specs = [spec[-1] for spec in specs]
input_datas = [torch.tensor(value, dtype=spec).to(device) for value, spec in zip(input_datas, specs) ]
times = time.time()
for i in range(1100):
if i < 100:
times = time.time()
dm_trt(input_datas)
print(f"*******batch_size: {batch_size} *******QPS: {1000 / (time.time() - times) * batch_size}")
time.sleep(10)