Commit ffea5b1c authored by zhangqha

update yolov5s_tvm
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Makefile Example to deploy TVM modules.
TVM_ROOT=$(shell cd ../..; pwd)
DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
ROCM_ROOT=/opt/dtk
OPENCV_INCLUDE = $(shell pkg-config --cflags opencv)
OPENCV_LIBS = $(shell pkg-config --libs opencv)
PKG_CFLAGS = -std=c++17 -O2 -fPIC\
-I${TVM_ROOT}/include\
-I${DMLC_CORE}/include\
-I${ROCM_ROOT}/include\
${OPENCV_INCLUDE}\
-I${TVM_ROOT}/3rdparty/dlpack/include\
-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -pthread\
-L${ROCM_ROOT}/lib -lamdhip64\
${OPENCV_LIBS}\
-L${ROCM_ROOT}/miopen/lib -lMIOpen
.PHONY: clean all
all: lib/libtvm_runtime_pack.o lib/yolov5s_deploy
# Build rule for all in one TVM package library
.PHONY: lib/libtvm_runtime_pack.o
lib/libtvm_runtime_pack.o: tvm_runtime_pack.cc
	@mkdir -p $(@D)
	$(CXX) -c $(PKG_CFLAGS) -o $@ $^
# Deploy using the all in one TVM package library
.PHONY: lib/yolov5s_deploy
lib/yolov5s_deploy: yolov5s_deploy.cc lib/libtvm_runtime_pack.o
	@mkdir -p $(@D)
	$(CXX) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS)
clean:
	rm -rf lib
# YOLOv5s (You Only Look Once version 5, small)
## Model Introduction
YOLOv5s is an object detection model and the small variant of the YOLOv5 family. It was developed by Ultralytics and is implemented in PyTorch. Compared with the earlier YOLOv3 and YOLOv4, YOLOv5s improves markedly in both speed and accuracy while also reducing model size, which makes it an excellent object detection model capable of handling a large number of detection tasks in a short time.
## Model Structure
The YOLOv5s model consists of the following parts:
Backbone: the backbone network uses the CSPDarknet53 structure, where CSP stands for Cross Stage Partial connections, which improve the model's efficiency and accuracy.
Neck: SPP (Spatial Pyramid Pooling) and PAN (Path Aggregation Network) enlarge the receptive field and improve the model's ability to detect objects.
Head: the output layer contains three detection heads of different sizes for detecting objects at different scales. Each head outputs anchor boxes and class probabilities at its own resolution, which are then filtered and refined into the final detections (see the sketch after this list).
Training Strategy: a newer training strategy called Mosaic data augmentation stitches several randomly selected images together during training, improving the model's robustness and generalization.
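As a rough cross-check of the head geometry (a back-of-the-envelope sketch, assuming the standard YOLOv5 strides of 8/16/32 and 3 anchors per grid cell), the 640x640 input yields the 1x25200x85 output tensor consumed by the deployment code below:
```
strides = [8, 16, 32]                          # assumed YOLOv5 head strides
anchors_per_cell = 3
num_classes = 80                               # COCO
cells = [(640 // s) ** 2 for s in strides]     # 6400, 1600, 400 grid cells
num_proposals = anchors_per_cell * sum(cells)  # 3 * 8400 = 25200
box_attrs = 4 + 1 + num_classes                # xywh + objectness + classes = 85
print(num_proposals, box_attrs)                # 25200 85
```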
## Model File
Download the yolov5s.onnx file as needed.
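Once downloaded, the model input can be sanity-checked with the onnx package (a minimal sketch; it assumes the file sits next to the scripts, matching the ./yolov5s.onnx path used by the Python scripts):
```
import onnx

model = onnx.load("./yolov5s.onnx")
onnx.checker.check_model(model)
inp = model.graph.input[0]  # expected: name "images", shape 1x3x640x640
print(inp.name, [d.dim_value for d in inp.type.tensor_type.shape.dim])
```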
## Dataset
The dataset used for validation in this example comes from:
```
https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip
```
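A minimal fetch-and-unpack sketch using only the Python standard library (extracting into the current directory is an assumption; place the data wherever your test setup expects it):
```
import urllib.request
import zipfile

url = "https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip"
urllib.request.urlretrieve(url, "coco128.zip")   # download the archive
with zipfile.ZipFile("coco128.zip") as zf:
    zf.extractall(".")                           # unpacks into ./coco128
```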
## Inference
### Environment Setup
A Docker image that can be pulled from [光源](https://www.sourcefind.cn/#/service-details) is provided:
* Inference image:
```
```
* Activate the image environment and run the tests
```
cd /tvm-0.11-dev0/apps/howto_deploy.yolov5s
```
### Single-Card Test
CPP Deploy test reference:
```
bash run_example.sh
```
Python Deploy test reference:
```
python yolov5s_infer.py
```
## Accuracy
See result.jpg.
## Source Repository and Issue Reporting
* https://developer.hpccube.com/codes/modelzoo/yolov5s_tvm
## References
* https://developer.hpccube.com/codes/modelzoo/yolov5s_tvm
cow.jpg (binary image, 278 KB)
# Model name
modelName=YOLOV5S_TVM
# Model description
modelDescription=YOLOv5s is an object detection model, the small variant of the YOLOv5 family, optimized with TVM
# Application scenarios (separate multiple tags with commas)
appScenario=object detection,localization
# Framework type (separate multiple tags with commas)
frameType=TVM
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Script to prepare test_addone.so"""
import tvm
import numpy as np
from tvm import te
from tvm import relay
import os
import onnx
img_data = np.random.rand(1, 3, 640, 640).astype("float32") / 255
input_name = "images"
shape_dict = {input_name: img_data.shape}
input_shape = img_data.shape
print("input shape", img_data.shape)
model_path = "./yolov5s.onnx"
onnx_model = onnx.load(model_path)
np.random.seed(0)
dtype = "float32"
def prepare_test_libs(base_path):
    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)
    # Compile library as dynamic library
    fadd_dylib = tvm.build(s, [A, B], "llvm", name="addone")
    dylib_path = os.path.join(base_path, "test_addone_dll.so")
    fadd_dylib.export_library(dylib_path)
    # Compile library in system library mode
    fadd_syslib = tvm.build(s, [A, B], "llvm", name="addonesys")
    syslib_path = os.path.join(base_path, "test_addone_sys.o")
    fadd_syslib.save(syslib_path)
def prepare_graph_lib(base_path):
    # Import the ONNX model into Relay and build it for ROCm with MIOpen/rocBLAS kernels.
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
    compiled_lib = relay.build(mod, tvm.target.Target("rocm -libs=miopen,rocblas"), params=params)
    # Export it as a shared library.
    # If you are running cross compilation, you can also consider exporting
    # to tar and invoking the host compiler later.
    dylib_path = os.path.join(base_path, "yolov5s_miopen_rocblas.so")
    compiled_lib.export_library(dylib_path)
def model_test():
    from tvm.contrib import graph_executor
    ctx = tvm.rocm()
    # Load the library exported by prepare_graph_lib above.
    compile_lib: tvm.runtime.Module = tvm.runtime.load_module("lib/yolov5s_miopen_rocblas.so")
    module = graph_executor.GraphModule(compile_lib["default"](ctx))
    module.set_input(onnx_model.graph.input[0].name, img_data)
    module.run()
    output = module.get_output(0).numpy()
    print(output.shape)
if __name__ == "__main__":
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
#prepare_test_libs(os.path.join(curr_path, "lib"))
prepare_graph_lib(os.path.join(curr_path, "lib"))
#model_test()
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
echo "Build the libraries.."
mkdir -p lib
make
echo "Run the example"
export LD_LIBRARY_PATH=../../build:${LD_LIBRARY_PATH}
export DYLD_LIBRARY_PATH=../../build:${DYLD_LIBRARY_PATH}
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-22.10/lib/rocblas/library/
echo "Run the cpp deployment with all in normal library..."
lib/yolov5s_deploy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \brief This is an all in one TVM runtime file.
*
* You only have to use this file to compile libtvm_runtime to
* include in your project.
*
* - Copy this file into your project which depends on tvm runtime.
* - Compile with -std=c++17
* - Add the following include path
* - /path/to/tvm/include/
* - /path/to/tvm/3rdparty/dmlc-core/include/
* - /path/to/tvm/3rdparty/dlpack/include/
* - Add -lpthread -ldl to the linked library.
* - You are good to go.
* - See the Makefile in the same folder for example.
*
 * The include files here use relative paths.
 * Remember to change them to point to the right files.
*
*/
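// A hypothetical compile line matching the steps above (paths are placeholders;
// the Makefile in this folder is the authoritative example):
//   g++ -std=c++17 -O2 -fPIC \
//       -I/path/to/tvm/include \
//       -I/path/to/tvm/3rdparty/dmlc-core/include \
//       -I/path/to/tvm/3rdparty/dlpack/include \
//       -c tvm_runtime_pack.cc -o lib/libtvm_runtime_pack.o
//   (then link the final executable with -lpthread -ldl)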
#define TVM_USE_LIBBACKTRACE 0
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/container.cc"
#include "../../src/runtime/cpu_device_api.cc"
#include "../../src/runtime/file_utils.cc"
#include "../../src/runtime/library_module.cc"
#include "../../src/runtime/logging.cc"
#include "../../src/runtime/module.cc"
#include "../../src/runtime/ndarray.cc"
#include "../../src/runtime/object.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/thread_pool.cc"
#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/workspace_pool.cc"
#include "../../src/runtime/rocm/rocm_module.cc"
#include "../../src/runtime/rocm/rocm_device_api.cc"
// NOTE: all the files after this are optional modules
// that you can include or remove, depending on which features you use.
// Likely we only need to enable one of the following
// If you use Module::Load, use dso_module
// For system packed library, use system_lib_module
#include "../../src/runtime/dso_library.cc"
#include "../../src/runtime/system_library.cc"
#include "../../src/runtime/contrib/miopen/conv_forward.cc"
#include "../../src/runtime/contrib/miopen/miopen_utils.cc"
// Graph executor
#include "../../src/runtime/graph_executor/graph_executor.cc"
#include "../../src/runtime/graph_executor/graph_executor_factory.cc"
// Uncomment the following lines to enable RPC
// #include "../../src/runtime/rpc/rpc_session.cc"
// #include "../../src/runtime/rpc/rpc_event_impl.cc"
// #include "../../src/runtime/rpc/rpc_server_env.cc"
// These macros enable the device APIs when uncommented.
//#define TVM_CUDA_RUNTIME 1
//#define TVM_METAL_RUNTIME 1
//#define TVM_OPENCL_RUNTIME 1
#define TVM_ROCM_RUNTIME 1
#define TVM_USE_MIOPEN 1
#define __HIP_PLATFORM_HCC__ 1
// Uncomment the following lines to enable Metal
// #include "../../src/runtime/metal/metal_device_api.mm"
// #include "../../src/runtime/metal/metal_module.mm"
// Uncomment the following lines to enable CUDA
// #include "../../src/runtime/cuda/cuda_device_api.cc"
// #include "../../src/runtime/cuda/cuda_module.cc"
// Uncomment the following lines to enable OpenCL
// #include "../../src/runtime/opencl/opencl_device_api.cc"
// #include "../../src/runtime/opencl/opencl_module.cc"
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
 * \brief Example code that loads and runs a TVM module.
 * \file yolov5s_deploy.cc
*/
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <cstdio>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <typeinfo>
#include <algorithm>
#include <vector>
using namespace cv;
using namespace std;  // nms() below uses std::vector, std::sort, etc.
void Verify(tvm::runtime::Module mod, std::string fname) {
// Get the function from the module.
tvm::runtime::PackedFunc f = mod.GetFunction(fname);
ICHECK(f != nullptr);
// Allocate the DLPack data structures.
//
// Note that we use the TVM runtime API to allocate the DLTensor in this example.
// TVM accepts DLPack-compatible DLTensors, so the function can be invoked
// as long as we pass a correct pointer to a DLTensor array.
//
// For more information please refer to dlpack.
// One thing to notice is that DLPack contains an alignment requirement for
// the data pointer, and TVM takes advantage of that.
// If you plan to use your customized data container, please
// make sure the DLTensor you pass in meets the alignment requirement.
//
DLTensor* x;
DLTensor* y;
int ndim = 1;
int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
int device_type = kDLCPU;
int device_id = 0;
int64_t shape[1] = {10};
TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
for (int i = 0; i < shape[0]; ++i) {
static_cast<float*>(x->data)[i] = i;
}
// Invoke the function
// PackedFunc is a function that can be invoked via positional argument.
// The signature of the function is specified in tvm.build
f(x, y);
// Print out the output
for (int i = 0; i < shape[0]; ++i) {
ICHECK_EQ(static_cast<float*>(y->data)[i], i + 1.0f);
}
LOG(INFO) << "Finish verification...";
TVMArrayFree(x);
TVMArrayFree(y);
}
void DeploySingleOp() {
// Normally we can directly load the module from a dynamic shared library.
tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile("lib/test_addone_dll.so");
LOG(INFO) << "Verify dynamic loading from test_addone_dll.so";
Verify(mod_dylib, "addone");
// For libraries that are directly packed as system lib and linked together with the app
// We can directly use GetSystemLib to get the system wide library.
LOG(INFO) << "Verify load function from system lib";
tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("runtime.SystemLib"))();
Verify(mod_syslib, "addonesys");
}
void PreProcess(const Mat& image, Mat& image_blob)
{
Mat input;
image.copyTo(input);
std::vector<Mat> channels, channel_p;
split(input, channels);
Mat R, G, B;
B = channels.at(0);
G = channels.at(1);
R = channels.at(2);
B = (B / 255. - 0.408) / 0.242;
G = (G / 255. - 0.448) / 0.239;
R = (R / 255. - 0.471) / 0.234;
channel_p.push_back(R);
channel_p.push_back(G);
channel_p.push_back(B);
Mat outt;
merge(channel_p, outt);
image_blob = outt;
}
// Convert an interleaved 640x640 BGR frame to planar CHW floats in [0, 1].
void Mat_to_CHW(float *img_data, cv::Mat &frame)
{
assert(img_data && !frame.empty());
unsigned int volChl = 640 * 640;
for(int c = 0; c < 3; ++c)
{
for (unsigned j = 0; j < volChl; ++j)
img_data[c*volChl + j] = static_cast<float>(float(frame.data[j * 3 + c])/255.0);
}
}
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
void nms(vector<BoxInfo>& input_boxes)
{
float nmsThreshold = 0.45;
sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
vector<float> vArea(input_boxes.size());
for (int i = 0; i < input_boxes.size(); ++i)
{
vArea[i] = (input_boxes[i].x2 - input_boxes[i].x1 + 1)* (input_boxes[i].y2 - input_boxes[i].y1 + 1);
}
vector<bool> isSuppressed(input_boxes.size(), false);
for (int i = 0; i < input_boxes.size(); ++i)
{
if (isSuppressed[i]) { continue; }
for (int j = i + 1; j < input_boxes.size(); ++j)
{
if (isSuppressed[j]) { continue; }
float xx1 = max(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = max(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = min(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = min(input_boxes[i].y2, input_boxes[j].y2);
float w = max(0.0f, xx2 - xx1 + 1);
float h = max(0.0f, yy2 - yy1 + 1);
float inter = w * h;
if(input_boxes[i].label == input_boxes[j].label)
{
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= nmsThreshold)
{
isSuppressed[j] = true;
}
}
}
}
int idx_t = 0;
input_boxes.erase(remove_if(input_boxes.begin(), input_boxes.end(), [&idx_t, &isSuppressed](const BoxInfo& f) { return isSuppressed[idx_t++]; }), input_boxes.end());
}
void DeployGraphExecutor() {
LOG(INFO) << "Running graph executor...";
// load the compiled library
DLDevice dev{kDLROCM, 0};
tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("lib/yolov5s_miopen_rocblas.so");
// create the graph executor module
tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev);
cout<<"---------------"<<endl;
tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
tvm::runtime::PackedFunc run = gmod.GetFunction("run");
cv::Mat image = cv::imread("./cow.jpg");
//cv::Mat image = cv::imread("./bear.jpg");
cv::Mat in_put;
cv::Mat img_in;
cv::resize(image, in_put, cv::Size(640, 640));
//cv::cvtColor(frame, in_put, cv::COLOR_BGR2RGB);
static float img_data[640*640*3];  // ~4.7 MB CHW buffer; static keeps it off the stack
// PreProcess(in_put, img_in);
// Mat_to_CHW(img_data, img_in);
Mat_to_CHW(img_data, in_put);
//int input_dtype_code = kDLFloat;
//int input_dtype_bits = 32;
//int input_dtype_lanes = 1;
//DLDataType input_dtype = {input_dtype_code, input_dtype_bits, input_dtype_lanes};
// Use the C++ API
//tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({1, 3, 224, 224}, input_dtype, {kDLROCM, 0});
//tvm::runtime::NDArray input_data = tvm::runtime::NDArray::Empty({1, 3, 224, 224}, DLDataType{kDLFloat, 32, 1}, dev);
//tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({1, 1000}, DLDataType{kDLFloat, 32, 1}, {kDLROCM,0});
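// The output tensor is 1 x 25200 x 85: 25200 = 3 anchors * (80*80 + 40*40 + 20*20)
// grid cells at strides 8/16/32 on the 640x640 input, and 85 = 4 box coordinates
// + 1 objectness score + 80 COCO class scores.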
DLTensor* y;
int out_ndim = 3;
int64_t out_shape[3] = {1, 25200, 85};
int dtype_code = kDLFloat;
int dtype_bits = 32;
int dtype_lanes = 1;
int device_type = kDLROCM;
int device_id = 0;
TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
DLTensor* x;
int ndim = 4;
//int dtype_code = kDLFloat;
//int dtype_bits = 32;
//int dtype_lanes = 1;
//int device_type = kDLROCM;
//int device_id = 0;
int64_t shape[4] = {1, 3 ,640, 640};
TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
// Copy the host image buffer into the ROCm device tensor.
TVMArrayCopyFromBytes(x, img_data, 3*640*640*sizeof(float));
// set the right input
set_input("images", x);
// run the code
run();
//get the output
get_output(0, y);
static float result[25200][85] = {0};
TVMArrayCopyToBytes(y, result, 25200 * 85 * sizeof(float));
int num_proposal = sizeof(result)/sizeof(result[0]); //25200
int box_classes = sizeof(result[0])/sizeof(result[0][0]);//85
cout<<"num_proposal:"<<num_proposal<<endl;
cout<<"box_classes:"<<box_classes<<endl;
vector<BoxInfo> generate_boxes; // BoxInfo is the detection struct defined above
float* pdata = result[0];
//YOLOv5 detect(pdata);
float ratioh=1,ratiow=1;
//float ratioh = (float)image.rows / 640, ratiow = (float)image.cols / 640;
cout<<"ratioh:"<<ratioh<<"\nratiow:"<<ratiow<<endl;
float objThreshold=0.2, confThreshold=0.6;
//vector<float> confidences;
//vector<Rect> boxes;
//vector<int> classIds;
float padw=0,padh=0;
for(int i=0;i<num_proposal;i++)
{
int index = i*box_classes;
float obj_conf = pdata[index+4]; // objectness confidence score
//cout<<pdata[i]<<endl;
//cout<<"obj_conf:"<<obj_conf<<endl;
//cout<<"+"<<endl;
if(obj_conf>objThreshold)
{
cout<<"obj_conf"<<obj_conf<<endl;
// Find the class with the highest score among the 80 classes.
int class_idx = 0;
float max_class_score = 0;
for (int k = 0; k < 80; ++k)
{
if (pdata[k + index + 5] > max_class_score)
{
max_class_score = pdata[k + index + 5];
class_idx = k;
}
}
if (max_class_score > confThreshold){
float cx = pdata[index];
float cy = pdata[index+1];
float w = pdata[index+2];
float h = pdata[index+3];
float xmin = (cx - padw - 0.5 * w)*ratiow; // *ratiow maps back to the original image scale
float ymin = (cy - padh - 0.5 * h)*ratioh;
float xmax = (cx - padw + 0.5 * w)*ratiow;
float ymax = (cy - padh + 0.5 * h)*ratioh;
generate_boxes.push_back(BoxInfo{ xmin, ymin, xmax, ymax, max_class_score, class_idx });
}
}
}
//vector<int> indices;
//float nmsThreshold = 0.1;
nms(generate_boxes);
cout<<generate_boxes.size()<<endl;
for(size_t i=0;i<generate_boxes.size();i++){
float xmin = generate_boxes[i].x1;
float xmax = generate_boxes[i].x2;
float ymin = generate_boxes[i].y1;
float ymax = generate_boxes[i].y2;
float score = generate_boxes[i].score;
int classes = generate_boxes[i].label;
rectangle(in_put, Point(int(xmin), int(ymin)), Point(int(xmax), int(ymax)), Scalar(0, 0, 255), 2);
string label = format("%.2f", score);
putText(in_put, label, Point(int(xmin), int(ymin) - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
//imwrite("result.jpg",in_put)
cout<<"xmin:"<<xmin<<endl;
cout<<"xmax:"<<xmax<<endl;
cout<<"ymin:"<<ymin<<endl;
cout<<"ymax:"<<ymax<<endl;
cout<<"score:"<<score<<endl;
cout<<"classes:"<<classes<<endl;
}
cout<<"----------"<<endl;
imwrite("result.jpg",in_put);
}
int main(void) {
//DeploySingleOp();
DeployGraphExecutor();
return 0;
}
import onnx
import tvm
from PIL import Image
import cv2
from tvm import relay
import numpy as np
from yolov5s_pred_utils import non_max_suppression
# onnx_model = onnx.load('model-zoo/googlenet.onnx')
onnx_model = onnx.load('./yolov5s.onnx')
img = Image.open('./cow.jpg').resize((640,640))
img = np.array(img).transpose((2, 0, 1)).astype('float32')
img = img/255.0
x = img[np.newaxis, :]
#img_data = np.random.rand(1,3,224,224).astype("float32")/255
#target = "rocm"
# target = "llvm"
dev = tvm.rocm(0)
# dev = tvm.cpu(0)
#target = "rocm -libs=miopen"
target = "rocm -libs=miopen,rocblas"
input_name = onnx_model.graph.input[0].name
print(input_name)
shape_dict = {input_name:x.shape}
print('shape_dict', shape_dict)
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype='float32')
# with relay.build_config(opt_level=2):
# graph, lib, params = relay.build_module.build(mod, target=target, params=params)
dtype = 'float32'
from tvm.contrib import graph_runtime
from tvm.contrib import graph_executor
with tvm.transform.PassContext(opt_level=1):
    lib = relay.build(mod, target=target, params=params)
# executor = relay.build_module.create_executor("graph", mod, dev, target, params).evaluate()
# output = executor(tvm.nd.array(x.astype(dtype)))
m = graph_executor.GraphModule(lib["default"](dev))
m.set_input(input_name,tvm.nd.array(x.astype(dtype)))
m.run()
'''
print('output model files')
libpath = 'out/googlenet.so'
lib.export_library(libpath)
graph_json_path = 'out/googlenet.json'
with open(graph_json_path, 'w')as f:
f.write(graph)
params_path = 'out/googlenet.params'
with open(params_path, 'wb')as f:
f.write(relay.save_param_dict(params))
load_json = open(graph_json_path).read()
load_lib = tvm.runtime.load_module(libpath)
load_params = bytearray(open(params_path, 'rb').read())
ctx = tvm.rocm()
module = graph_runtime.create(load_json,load_lib,ctx)
module.load_params(load_params)
module.run()
'''
# output = module.get_output(0).asnumpy()
output = m.get_output(0).numpy()
pred = non_max_suppression(output, conf_thres=0.1, iou_thres=0.50, classes=None, agnostic=False, multi_label=False, max_det=1000)
print(pred)
print(np.max(output,axis=1))
print(np.argmax(output,axis=1))
import numpy as np
import logging
import cv2
def xyxy2xywh(x):
    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy is the box center
    y = np.copy(x)
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
    y[:, 2] = x[:, 2] - x[:, 0]  # width
    y[:, 3] = x[:, 3] - x[:, 1]  # height
    return y
def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y
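# Quick sanity check of the two converters (hypothetical values): a 100x100 box
# centered at (320, 320) converts to corners and back:
#   xywh2xyxy(np.array([[320., 320., 100., 100.]]))  # -> [[270., 270., 370., 370.]]
#   xyxy2xywh(np.array([[270., 270., 370., 370.]]))  # -> [[320., 320., 100., 100.]]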
def nms(bboxes, scores, iou_thresh):
    """
    :param bboxes: array of detection boxes
    :param scores: array of confidence scores
    :param iou_thresh: IoU threshold
    :return: indices of the boxes kept after suppression
    """
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (y2 - y1) * (x2 - x1)
    # result list
    result = []
    index = scores.argsort()[::-1]  # sort boxes by confidence, descending, and keep the indices
    # For safety, everything below operates on indices.
    while index.size > 0:
        # Loop while candidate boxes remain.
        i = index[0]
        result.append(i)  # keep the highest-confidence box
        # Compute the IoU between this box and all remaining boxes.
        x11 = np.maximum(x1[i], x1[index[1:]])
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)
        h = np.maximum(0, y22 - y11 + 1)
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        # Keep only the indices whose IoU is at or below the threshold.
        idx = np.where(ious <= iou_thresh)[0]
        index = index[idx + 1]  # continue with the remaining boxes
    # bboxes, scores = bboxes[result], scores[result]
    # return bboxes, scores
    return result
def non_max_suppression(prediction,
                        conf_thres=0.25,
                        iou_thres=0.45,
                        classes=None,
                        agnostic=False,
                        multi_label=False,
                        max_det=300):
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes fed into nms()
    batch_size = prediction.shape[0]
    class_number = prediction.shape[2] - 5  # 85 - 5
    xc = prediction[..., 4] > conf_thres  # candidates above the objectness threshold
    output = [np.zeros((0, 6))] * batch_size
    box = prediction[xc]  # debug: all candidate boxes across the batch
    print("box.shape:", box.shape)
    print("box:", sorted(box[..., 4], reverse=True))
    for xi, x in enumerate(prediction):  # xi is the image index, x its predictions
        x = x[xc[xi]]
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
        box = xywh2xyxy(x[:, :4])
        # Detections matrix nx6 (xyxy, conf, cls)
        conf, j = x[:, 5:].max(1, keepdims=True), x[:, 5:].argmax(1)[:, None]  # best class per box
        x = np.concatenate((box, conf, j), 1)[conf.reshape(-1) > conf_thres]
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes: sort by confidence, descending (np.argsort is ascending)
            x = x[x[:, 4].argsort()[::-1][:max_nms]]
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # per-class coordinate offsets
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = nms(boxes, scores, iou_thres)  # NMS
        if len(i) > max_det:  # limit detections
            i = i[:max_det]
        output[xi] = x[i]
    return output