Commit dd462c30 authored by zhanggezhong's avatar zhanggezhong
Browse files

Initial commit

parents
# TVM
## 模型介绍
使用深度学习编译器TVM对ResNet50网络模型进行推理及调优
## 模型结构
ResNet50网络中包含了49个卷积层、1个全连接层等
## 数据集及模型文件
模型文件下载地址: "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx"
## 推理及自动调优
### 环境配置
拉取镜像:
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:tvm-0.10_dtk-22.10_py38_centos-7.6
### 执行推理及调优
下载模型文件后执行以下命令进行推理测试及调优测试:
python tune_resnet50-v2.py
## TVM版本
TVM-0.10
## 性能和准确率数据
使用DCUZ100加速卡执行推理,重复推理100次取平均性能
| 卡数 | batch size | 类型 | 性能 | 是否使用MIOpen | 是否使用tune |
| :------: | :------: | :------: | :------: |:------: | :------:|
| 1 | 1 | FP32 | 195 examples/second | 是 | 否 |
| 1 | 1 | FP32 | 177.83 examples/second | 否 | 否 |
## 参考
* <https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py>
This diff is collapsed.
from tvm import testing
import onnx
# NOTE(review): this request hook is a leftover from the TVM docs/CI tutorial
# this script is based on (it intercepts downloads during doc builds); it is
# harmless here but probably unnecessary - confirm before removing.
testing.utils.install_request_hook(depth=3)
# sphinx_gallery_end_ignore
from PIL import Image
import numpy as np
from scipy.special import softmax
import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_executor
import cv2
def get_network(name, batch_size, layout="NCHW", dtype="float32"):
    """Build a Relay module for the requested network.

    NOTE(review): relies on the module-level globals ``onnx_model`` and
    ``shape_dict`` — both are defined later in this script but before this
    function is called; keep that ordering if the script is refactored.

    Parameters
    ----------
    name : str
        Network identifier; only "resnet50-v2-7" is supported.
    batch_size : int
        Leading (batch) dimension of the reported input shape.
    layout : str
        "NCHW" or "NHWC"; adjust the image dimensions here if your model
        uses a different input size.
    dtype : str
        Element type forwarded to the ONNX importer.

    Returns
    -------
    tuple
        (mod, params, input_shape, output_shape).

    Raises
    ------
    ValueError
        If *layout* or *name* is not supported.
    """
    # auto-scheduler prefers the NHWC layout.
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)
    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)  # ImageNet-1k logits
    if name == "resnet50-v2-7":
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
        return mod, params, input_shape, output_shape
    # BUG FIX: the original fell through and implicitly returned None for an
    # unknown network name, which later crashed at tuple unpacking with a
    # confusing TypeError. Fail fast with a clear message instead.
    raise ValueError("Unsupported network: " + name)
# Data layout conversion currently supports the NCHW and NHWC formats.
'''
model_url = (
    "https://github.com/onnx/models/raw/main/"
    "vision/classification/resnet/model/"
    "resnet50-v2-7.onnx"
)
'''
# Path to the pre-downloaded ONNX model file (download URL in the README /
# commented-out model_url above).
model_path = "/resnet50-v2-7.onnx"
onnx_model = onnx.load(model_path)
# Fix the RNG seed for reproducibility.
np.random.seed(0)
def readimage(pathOfImage, GRAY=False, inputShape=(1, 3, 128, 128)):
    """Read an image from disk and preprocess it into an NCHW float32 batch.

    Parameters
    ----------
    pathOfImage : str
        Path of the image file to load.
    GRAY : bool
        If True, load a single-channel grayscale image; otherwise load a
        3-channel color image.
    inputShape : sequence of int
        Target (N, C, H, W) shape of the returned batch.

    Returns
    -------
    np.ndarray
        Array of shape *inputShape*, dtype float32, normalized with the
        ImageNet mean/stddev; the first image is replicated across the
        whole batch.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.
    """
    # BUG FIX: the default was a mutable list ([1, 3, 128, 128]); a tuple is
    # equivalent for indexing and avoids the shared-mutable-default pitfall.
    if GRAY:
        srcImage = cv2.imread(pathOfImage, cv2.IMREAD_GRAYSCALE)
        if srcImage is None:
            # BUG FIX: cv2.imread returns None on failure instead of raising,
            # which previously surfaced as an opaque AttributeError below.
            raise FileNotFoundError("cannot read image: " + pathOfImage)
        print("srcImage.shape:", srcImage.shape)
        # cv2.resize takes (width, height); add a channel axis -> CHW.
        resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
        srcImage_CHW = resizedImage.astype("float32")[None]
    else:
        srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)  # HWC, BGR order
        if srcImage is None:
            raise FileNotFoundError("cannot read image: " + pathOfImage)
        # BUG FIX: OpenCV loads images in BGR order, but the ImageNet
        # mean/stddev constants below (and the ONNX ResNet model) are
        # RGB-ordered; convert before normalizing.
        srcImage = cv2.cvtColor(srcImage, cv2.COLOR_BGR2RGB)
        resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
        srcImage_CHW = np.transpose(resizedImage.astype("float32"), (2, 0, 1))
    # ImageNet per-channel normalization constants (RGB order).
    mean_vec = np.array([0.485, 0.456, 0.406])
    stddev_vec = np.array([0.229, 0.224, 0.225])
    inputData = np.zeros(inputShape).astype("float32")  # NCHW
    # NOTE(review): for GRAY input only mean_vec[0]/stddev_vec[0] are applied,
    # i.e. the red-channel statistics - confirm this is intended.
    for i in range(srcImage_CHW.shape[0]):
        inputData[0, i, :, :] = (srcImage_CHW[i, :, :] / 255 - mean_vec[i]) / stddev_vec[i]
    # Replicate the first image into every other slot of the batch.
    for i in range(1, inputData.shape[0]):
        inputData[i, :, :, :] = inputData[0, :, :, :]
    return inputData
# Download the image data, then convert it to a numpy array to use as an input to the model.
#img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
img_path = "/kittens.jpg"
#img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
network = "resnet50-v2-7"
dtype = "float32"
target = "rocm"
#target = "rocm -libs=miopen,rocblas"  # MIOpen can be enabled for comparison when running inference
input_name = "data"
input_shape=[1,3,224,224]
img_data=readimage(img_path,GRAY=False,inputShape=input_shape)
batch_size = 1
layout = "NCHW"
# Map the graph input name to the concrete tensor shape; get_network reads
# this global when importing the ONNX model.
shape_dict = {input_name: img_data.shape}
input_shape = img_data.shape
print("input shape",img_data.shape)
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
# Baseline (untuned) compilation and inference.
print("Compile...")
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)
print("Compile successed !")
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
# Download a list of labels
#labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
labels_path = "synset.txt"
#labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
    labels = [l.rstrip() for l in f]
# Open the output and read the output tensor
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
print('class=%s ; probability=%f' %(labels[ranks[0]],scores[ranks[0]]))
# Evaluate
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=100, min_repeat_ms=500))
# Extract tuning tasks; the tuning log is named after network/layout/batch.
log_file = "%s-%s-B%d.json" % (network, layout, batch_size)
print("log_file name is {}".format(log_file))
print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)
# Begin Tuning
def run_tuning():
    """Run auto-scheduler tuning over the extracted tasks.

    Reads the module-level globals ``tasks``, ``task_weights`` and
    ``log_file``; tuning records are appended to *log_file*.
    """
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=2000,  # change this to 20000 to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)
    # BUG FIX: explicitly release the RPC measurement context so its local
    # measurement server is shut down once tuning finishes (as done in the
    # official TVM auto-scheduler tutorial).
    del measure_ctx


run_tuning()
# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=params)
print("Compile success !")
# BUG FIX: the original kept using the GraphModule built from the UNTUNED
# library, so the classification and benchmark below measured the pre-tuning
# binary. Rebuild the executor from the freshly compiled (tuned) `lib`.
module = graph_executor.GraphModule(lib["default"](dev))
labels_path = "synset.txt"
#labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
    labels = [l.rstrip() for l in f]
dtype = "float32"
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()
# Open the output and read the output tensor
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
print('class=%s ; probability=%f' %(labels[ranks[0]],scores[ranks[0]]))
# Evaluate the tuned module's inference latency.
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=100, min_repeat_ms=500))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment