Commit 9467a6d8 authored by lijian6's avatar lijian6
Browse files

Initial commit


Signed-off-by: lijian <lijian6@sugon.com>
parents
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Script to prepare test_addone.so"""
import tvm
import numpy as np
from tvm import te
from tvm import relay
import os
import onnx
# Random NCHW input tensor (batch=1, 3 channels, 640x640), scaled into [0, 1].
img_data = np.random.rand(1, 3, 640, 640).astype("float32") / 255
input_name = "images"
shape_dict = {input_name: img_data.shape}
input_shape = img_data.shape
print("input shape", img_data.shape)
# ONNX model that prepare_graph_lib() compiles for ROCm.
model_path = "./Resource/yolov7-tiny.onnx"
onnx_model = onnx.load(model_path)
# Note: seeding happens AFTER img_data was drawn, as in the original script.
np.random.seed(0)
dtype = "float32"
def prepare_test_libs(base_path):
    """Build the element-wise "add one" TE kernel and export it twice.

    The scraped source lost all indentation (the body sat at column 0),
    making it invalid Python; this restores the function structure.

    Args:
        base_path: directory that receives ``test_addone_dll.so`` (dynamic
            library) and ``test_addone_sys.o`` (system-library object file).
    """
    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)
    # Compile library as dynamic library
    fadd_dylib = tvm.build(s, [A, B], "llvm", name="addone")
    dylib_path = os.path.join(base_path, "test_addone_dll.so")
    fadd_dylib.export_library(dylib_path)
    # Compile library in system library mode
    fadd_syslib = tvm.build(s, [A, B], "llvm", name="addonesys")
    syslib_path = os.path.join(base_path, "test_addone_sys.o")
    fadd_syslib.save(syslib_path)
def prepare_graph_lib(base_path):
    """Compile the module-level ONNX model for ROCm and export the library.

    Restores the indentation lost in the scrape (the body sat at column 0).
    Uses the module-level ``onnx_model``, ``shape_dict`` and ``dtype``, and
    targets ROCm with the MIOpen and rocBLAS vendor libraries.

    Args:
        base_path: directory that receives ``yolov7t_miopen_rocblas.so``.
    """
    # build a module from the loaded ONNX graph
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
    compiled_lib = relay.build(mod, tvm.target.Target("rocm -libs=miopen,rocblas"), params=params)
    # export it as a shared library
    # If you are running cross compilation, you can also consider export
    # to tar and invoke host compiler later.
    dylib_path = os.path.join(base_path, "yolov7t_miopen_rocblas.so")
    compiled_lib.export_library(dylib_path)
def model_test():
    """Smoke-test a previously exported library on ROCm.

    Restores the indentation lost in the scrape. Loads the shared library,
    feeds the module-level random ``img_data`` into the model's first input
    and prints the output shape.

    NOTE(review): the hard-coded path loads ``vgg16_test_relay_add.so`` even
    though this script exports a YOLO library — confirm which artifact this
    test is meant to exercise.
    """
    from tvm.contrib import graph_executor

    ctx = tvm.rocm()
    compile_lib: tvm.runtime.Module = tvm.runtime.load_module("lib/vgg16_test_relay_add.so")
    module = graph_executor.GraphModule(compile_lib["default"](ctx))
    module.set_input(onnx_model.graph.input[0].name, img_data)
    module.run()
    output = module.get_output(0).asnumpy()
    print(output.shape)
if __name__ == "__main__":
    # Restore the indentation lost in the scrape: everything below belongs
    # inside the main guard. Builds the graph library into ./lib next to
    # this script.
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    # prepare_test_libs(os.path.join(curr_path, "lib"))
    prepare_graph_lib(os.path.join(curr_path, "lib"))
    # model_test()
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Require exactly one positional argument: the index of the sample to run.
if [ $# -ne 1 ]; then
    # The original message claimed "Two args are required" while the test
    # above demands exactly one.
    echo "One arg is required: $0 index"
    echo "index:"
    # -e makes echo interpret \t; without it the escapes printed literally.
    echo -e "\t 1) YOLOV3T sample."
    echo -e "\t 2) YOLOV5S sample."
    echo -e "\t 3) YOLOV7T sample."
    # Non-zero status so callers can detect the usage error.
    exit 1
fi
# Build the sample binaries and point the loader at the TVM runtime.
echo "Build the libraries.."
mkdir -p lib
make
# TVM runtime shared-library location (DYLD_* is the macOS counterpart of
# LD_LIBRARY_PATH; harmless elsewhere).
export LD_LIBRARY_PATH=/tvm-0.11-dev0/build:${LD_LIBRARY_PATH}
export DYLD_LIBRARY_PATH=/tvm-0.11-dev0//build:${DYLD_LIBRARY_PATH}
# Tell rocBLAS where its Tensile kernel files live.
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk/lib/rocblas/library/
echo "Run the example"
echo "Run the cpp deployment with all in normal library..."
# Dispatch on the sample index. Valid values are 1, 2 and 3; anything else
# prints a corrected hint (the original said "0 / 1 / 2", which did not match
# the accepted values).
case "$1" in
    1)
        echo "Run yolov3-tiny sample"
        ./yolov3t_deploy
        ;;
    2)
        echo "Run yolov5s sample"
        ./yolov5s_deploy
        ;;
    3)
        echo "Run yolov7-tiny sample"
        ./yolov7t_deploy
        ;;
    *)
        echo "error index, please set arg to 1 / 2 / 3."
        ;;
esac
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <cstdio>
#include <fstream>
#include </usr/include/opencv2/opencv.hpp>
#include </usr/include/opencv2/highgui/highgui.hpp>
#include </usr/include/opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <typeinfo>
#include <algorithm>
#include <vector>
#include <algorithm>
#include <Decoder.h>
#include <Queuethread.h>
#include <thread>
using namespace cv;
using namespace std;
// Decoder thread entry point: demultiplexes and decodes the video stream set
// up by Decoder::DecoderInit(), converts each decoded YUV420p frame to RGB
// and pushes it into the shared queue for the inference loop.
// Completion/failure is signalled via que->DecodeEnd and que->finish().
static void DecoderThreadFunc(Queue* queue)
{
// end: 0 = still reading packets, 1 = input exhausted (decoder is being
// flushed), 2 = decoder fully drained.
int ret, end = 0;
int frame_cnt = 0;
Queue* que = queue;
Decoder decoder;
decoder.DecoderInit();
while(true)
{
// Negative return: no more packets in the container.
if (av_read_frame(decoder.fmt_ctx, decoder.pkt) < 0)
{
if(end == 2)
{
// Drain completed on an earlier iteration: notify the consumer and stop.
que->DecodeEnd = true;
break;
}
end = 1;
}
if (decoder.pkt->stream_index == decoder.video_stream_idx) {
if(!end) {
ret = avcodec_send_packet(decoder.video_dec_ctx, decoder.pkt);
} else {
// A NULL packet puts the codec into draining mode (flush buffered frames).
ret = avcodec_send_packet(decoder.video_dec_ctx, NULL);
}
if (ret < 0 && ret != AVERROR_EOF) {
fprintf(stderr, "Error submitting a packet for decoding\n");
que->DecodeEnd = true;
break;
}
// Pull every frame the decoder currently has ready.
while (ret >= 0 || end == 1)
{
ret = avcodec_receive_frame(decoder.video_dec_ctx, decoder.frame);
if (ret == AVERROR(EAGAIN)) {
// Decoder needs more input — go read the next packet.
break;
} else if (ret == AVERROR_EOF ) {
// All buffered frames delivered; the loop above will terminate next pass.
end = 2;
break;
} else if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Error while receiving a frame from the decoder\n");
que->finish();
return;
}
decoder.frame->pts = decoder.frame->best_effort_timestamp;
frame_cnt++;
// Pack the Y, U and V planes into one height*3/2 x width buffer
// (YUV420p: U and V are each a quarter of the luma size), then convert.
// NOTE(review): assumes tightly packed planes (linesize == width) —
// confirm for the videos actually used.
cv::Mat srcImage = cv::Mat::zeros(decoder.frame->height*3/2, decoder.frame->width, CV_8UC1);
memcpy(srcImage.data, (unsigned char*)decoder.frame->data[0], decoder.frame->width * decoder.frame->height);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height, (unsigned char*)decoder.frame->data[1], decoder.frame->width * decoder.frame->height/4);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height*5/4, (unsigned char*)decoder.frame->data[2], decoder.frame->width * decoder.frame->height/4);
cvtColor(srcImage, srcImage, COLOR_YUV420p2RGB);
que->enQueue(srcImage);
av_frame_unref(decoder.frame);
}
}
av_packet_unref(decoder.pkt);
}
fprintf(stdout, "Decoder: ####### frame count: %d\n", frame_cnt);
que->finish();
}
// Convert an interleaved 8-bit 3-channel image (HWC) into planar CHW floats
// scaled to [0, 1].
//
// @param img_data destination buffer of at least frame.rows*frame.cols*3 floats
// @param frame    source image; assumed continuous and 3-channel (the caller
//                 produces it with cv::resize) — TODO confirm for other callers
void Mat_to_CHW(float *img_data, cv::Mat &frame)
{
    assert(img_data && !frame.empty());
    // Derive the plane size from the image itself instead of hard-coding the
    // network resolution, so the helper works for any input size.
    const unsigned int volChl = frame.rows * frame.cols;
    for (unsigned int c = 0; c < 3; ++c)
    {
        for (unsigned int j = 0; j < volChl; ++j)
            img_data[c * volChl + j] = static_cast<float>(frame.data[j * 3 + c] / 255.0);
    }
}
// Axis-aligned detection box in original-image coordinates.
typedef struct BoxInfo
{
    float x1;    // left
    float y1;    // top
    float x2;    // right
    float y2;    // bottom
    float score; // confidence (objectness * best class score upstream)
    int label;   // class index
} BoxInfo;

// Greedy per-class non-maximum suppression at IoU threshold 0.45.
// Sorts boxes by descending score and removes any box whose IoU with an
// already-kept box of the SAME label reaches the threshold. The surviving
// boxes remain sorted by score.
//
// Fixes vs. original: the compaction used std::remove_if with a stateful
// index-counting predicate, which depends on unspecified predicate
// application order; replaced with an explicit in-order copy. Loop indices
// are size_t to avoid signed/unsigned comparisons.
void nms(std::vector<BoxInfo>& input_boxes)
{
    const float nmsThreshold = 0.45f;
    std::sort(input_boxes.begin(), input_boxes.end(),
              [](const BoxInfo& a, const BoxInfo& b) { return a.score > b.score; });
    const size_t n = input_boxes.size();
    // Pre-compute each box area (+1 in each dimension, pixel-inclusive).
    std::vector<float> vArea(n);
    for (size_t i = 0; i < n; ++i)
    {
        vArea[i] = (input_boxes[i].x2 - input_boxes[i].x1 + 1) *
                   (input_boxes[i].y2 - input_boxes[i].y1 + 1);
    }
    std::vector<bool> isSuppressed(n, false);
    for (size_t i = 0; i < n; ++i)
    {
        if (isSuppressed[i]) { continue; }
        for (size_t j = i + 1; j < n; ++j)
        {
            if (isSuppressed[j]) { continue; }
            // Suppression only applies between boxes of the same class.
            if (input_boxes[i].label != input_boxes[j].label) { continue; }
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(0.0f, xx2 - xx1 + 1);
            float h = std::max(0.0f, yy2 - yy1 + 1);
            float inter = w * h;
            if (inter / (vArea[i] + vArea[j] - inter) >= nmsThreshold)
            {
                isSuppressed[j] = true;
            }
        }
    }
    // In-order compaction of the surviving boxes.
    std::vector<BoxInfo> kept;
    kept.reserve(n);
    for (size_t i = 0; i < n; ++i)
    {
        if (!isSuppressed[i]) { kept.push_back(input_boxes[i]); }
    }
    input_boxes.swap(kept);
}
// Run YOLOv3-tiny inference (TVM/ROCm) on one frame and append the decoded,
// NMS-filtered detections (in original-image coordinates) to generate_boxes.
//
// Fixes vs. original: the module was reloaded and the graph executor rebuilt
// on every call (now cached in function-local statics); both DLTensors were
// leaked each frame (now freed); the ~2 MB input buffer lived on the stack
// (now heap); the input was written with a raw host memcpy into a
// ROCm-device tensor (now TVMArrayCopyFromBytes).
void DeployGraphExecutor(cv::Mat &srcImage, std::vector<BoxInfo> &generate_boxes)
{
    constexpr int kInputSize = 416;     // network input resolution
    constexpr int kNumProposal = 2535;  // rows of the YOLO output tensor
    constexpr int kBoxClasses = 85;     // 4 box + 1 objectness + 80 classes
    // Load the compiled library once and reuse it for every frame.
    static DLDevice dev{kDLROCM, 0};
    static tvm::runtime::Module mod_factory =
        tvm::runtime::Module::LoadFromFile("lib/yolov3t_miopen_rocblas.so");
    static tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev);
    tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
    tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
    tvm::runtime::PackedFunc run = gmod.GetFunction("run");

    // Preprocess: resize and convert HWC uint8 -> CHW float in [0, 1].
    cv::Mat in_put;
    cv::resize(srcImage, in_put, cv::Size(kInputSize, kInputSize));
    std::vector<float> img_data(kInputSize * kInputSize * 3);
    Mat_to_CHW(img_data.data(), in_put);

    int dtype_code = kDLFloat, dtype_bits = 32, dtype_lanes = 1;
    int device_type = kDLROCM, device_id = 0;
    DLTensor* x = nullptr;
    int64_t shape[4] = {1, 3, kInputSize, kInputSize};
    TVMArrayAlloc(shape, 4, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
    // x->data lives on the ROCm device; use the TVM copy API for the
    // host->device transfer.
    TVMArrayCopyFromBytes(x, img_data.data(), img_data.size() * sizeof(float));

    DLTensor* y = nullptr;
    int64_t out_shape[3] = {1, kNumProposal, kBoxClasses};
    TVMArrayAlloc(out_shape, 3, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);

    set_input("images", x);
    run();
    get_output(0, y);

    std::vector<float> result(kNumProposal * kBoxClasses);
    TVMArrayCopyToBytes(y, result.data(), result.size() * sizeof(float));
    // Release the device tensors (the original leaked them every frame).
    TVMArrayFree(x);
    TVMArrayFree(y);

    const float* pdata = result.data();
    // Scale factors back to the original image size.
    float ratioh = (float)srcImage.rows / kInputSize;
    float ratiow = (float)srcImage.cols / kInputSize;
    float objThreshold = 0.5f, confThreshold = 0.5f;
    for (int i = 0; i < kNumProposal; i++)
    {
        int index = i * kBoxClasses;
        float obj_conf = pdata[index + 4];  // objectness score
        if (obj_conf <= objThreshold) continue;
        // Pick the best of the 80 class scores.
        int class_idx = 0;
        float max_class_socre = 0;
        for (int k = 0; k < 80; ++k)
        {
            if (pdata[k + index + 5] > max_class_socre)
            {
                max_class_socre = pdata[k + index + 5];
                class_idx = k;
            }
        }
        if (max_class_socre > confThreshold)
        {
            // (cx, cy, w, h) -> corner box, rescaled to the source image.
            float cx = pdata[index];
            float cy = pdata[index + 1];
            float w = pdata[index + 2];
            float h = pdata[index + 3];
            generate_boxes.push_back(BoxInfo{
                float((cx - 0.5 * w) * ratiow), float((cy - 0.5 * h) * ratioh),
                float((cx + 0.5 * w) * ratiow), float((cy + 0.5 * h) * ratioh),
                max_class_socre, class_idx});
        }
    }
    nms(generate_boxes);
}
int main()
{
Queue* queue = new Queue(1);
std::thread ThreadDecoder(DecoderThreadFunc, queue);
int frame_cnt = 0;
double start_time = getTickCount();
while (!queue->DecodeEnd)
{
cv::Mat srcImage;
queue->deQueue(&srcImage);
if (srcImage.empty()) {
continue;
}
std::vector<BoxInfo> generate_boxes;
double time1 = getTickCount();
DeployGraphExecutor(srcImage, generate_boxes);
double time2 = getTickCount();
double elapsedTime = (time2 - time1)*1000 / getTickFrequency();
fprintf(stdout, "inference time:%f ms\n", elapsedTime);
frame_cnt++;
// postprocess
fprintf(stdout,"////////////////Detection Results////////////////\n");
for(size_t i=0;i<generate_boxes.size();i++)
{
BoxInfo result = generate_boxes[i];
rectangle(srcImage, Point(result.x1, result.y1), Point(int(result.x2), int(result.y2)), Scalar(0, 0, 255), 2);
string score = format("score:%.2f", result.score);
string label = format("label:%d", result.label);
putText(srcImage, score, Point(result.x1, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
putText(srcImage, label, Point(result.x1 + 130, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
fprintf(stdout, "detector result[%d] box:%.1f %.1f %.1f %.1f, label:%d, confidence:%.2f\n", i,
result.x1, result.x2, result.y1, result.y2, result.label, result.score);
}
// save result
/*char out[20];
snprintf(out, sizeof(out), "out/Frame%d.jpg", frame_cnt);
imwrite(out, srcImage);*/
}
double end_time = getTickCount();
fprintf(stdout, "Finish ####### frame_cnt: %d, Inference fps: %.2f, all time: %.2f ms\n", frame_cnt, float(frame_cnt/((end_time - start_time)/getTickFrequency())), (end_time - start_time)/getTickFrequency()*1000);
ThreadDecoder.join();
delete queue;
queue = NULL;
return 0;
}
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <cstdio>
#include <fstream>
#include </usr/include/opencv2/opencv.hpp>
#include </usr/include/opencv2/highgui/highgui.hpp>
#include </usr/include/opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <typeinfo>
#include <algorithm>
#include <vector>
#include <algorithm>
#include <Decoder.h>
#include <Queuethread.h>
#include <thread>
using namespace cv;
using namespace std;
// Decoder thread entry point: demultiplexes and decodes the video stream set
// up by Decoder::DecoderInit(), converts each decoded YUV420p frame to RGB
// and pushes it into the shared queue for the inference loop.
// Completion/failure is signalled via que->DecodeEnd and que->finish().
static void DecoderThreadFunc(Queue* queue)
{
// end: 0 = still reading packets, 1 = input exhausted (decoder is being
// flushed), 2 = decoder fully drained.
int ret, end = 0;
int frame_cnt = 0;
Queue* que = queue;
Decoder decoder;
decoder.DecoderInit();
while(true)
{
// Negative return: no more packets in the container.
if (av_read_frame(decoder.fmt_ctx, decoder.pkt) < 0)
{
if(end == 2)
{
// Drain completed on an earlier iteration: notify the consumer and stop.
que->DecodeEnd = true;
break;
}
end = 1;
}
if (decoder.pkt->stream_index == decoder.video_stream_idx) {
if(!end) {
ret = avcodec_send_packet(decoder.video_dec_ctx, decoder.pkt);
} else {
// A NULL packet puts the codec into draining mode (flush buffered frames).
ret = avcodec_send_packet(decoder.video_dec_ctx, NULL);
}
if (ret < 0 && ret != AVERROR_EOF) {
fprintf(stderr, "Error submitting a packet for decoding\n");
que->DecodeEnd = true;
break;
}
// Pull every frame the decoder currently has ready.
while (ret >= 0 || end == 1)
{
ret = avcodec_receive_frame(decoder.video_dec_ctx, decoder.frame);
if (ret == AVERROR(EAGAIN)) {
// Decoder needs more input — go read the next packet.
break;
} else if (ret == AVERROR_EOF ) {
// All buffered frames delivered; the loop above will terminate next pass.
end = 2;
break;
} else if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Error while receiving a frame from the decoder\n");
que->finish();
return;
}
decoder.frame->pts = decoder.frame->best_effort_timestamp;
frame_cnt++;
// Pack the Y, U and V planes into one height*3/2 x width buffer
// (YUV420p: U and V are each a quarter of the luma size), then convert.
// NOTE(review): assumes tightly packed planes (linesize == width) —
// confirm for the videos actually used.
cv::Mat srcImage = cv::Mat::zeros(decoder.frame->height*3/2, decoder.frame->width, CV_8UC1);
memcpy(srcImage.data, (unsigned char*)decoder.frame->data[0], decoder.frame->width * decoder.frame->height);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height, (unsigned char*)decoder.frame->data[1], decoder.frame->width * decoder.frame->height/4);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height*5/4, (unsigned char*)decoder.frame->data[2], decoder.frame->width * decoder.frame->height/4);
cvtColor(srcImage, srcImage, COLOR_YUV420p2RGB);
que->enQueue(srcImage);
av_frame_unref(decoder.frame);
}
}
av_packet_unref(decoder.pkt);
}
fprintf(stdout, "Decoder: ####### frame count: %d\n", frame_cnt);
que->finish();
}
// Convert an interleaved 8-bit 3-channel image (HWC) into planar CHW floats
// scaled to [0, 1].
//
// @param img_data destination buffer of at least frame.rows*frame.cols*3 floats
// @param frame    source image; assumed continuous and 3-channel (the caller
//                 produces it with cv::resize) — TODO confirm for other callers
void Mat_to_CHW(float *img_data, cv::Mat &frame)
{
    assert(img_data && !frame.empty());
    // Derive the plane size from the image itself instead of hard-coding the
    // network resolution, so the helper works for any input size.
    const unsigned int volChl = frame.rows * frame.cols;
    for (unsigned int c = 0; c < 3; ++c)
    {
        for (unsigned int j = 0; j < volChl; ++j)
            img_data[c * volChl + j] = static_cast<float>(frame.data[j * 3 + c] / 255.0);
    }
}
// Axis-aligned detection box in original-image coordinates.
typedef struct BoxInfo
{
    float x1;    // left
    float y1;    // top
    float x2;    // right
    float y2;    // bottom
    float score; // confidence (objectness * best class score upstream)
    int label;   // class index
} BoxInfo;

// Greedy per-class non-maximum suppression at IoU threshold 0.45.
// Sorts boxes by descending score and removes any box whose IoU with an
// already-kept box of the SAME label reaches the threshold. The surviving
// boxes remain sorted by score.
//
// Fixes vs. original: the compaction used std::remove_if with a stateful
// index-counting predicate, which depends on unspecified predicate
// application order; replaced with an explicit in-order copy. Loop indices
// are size_t to avoid signed/unsigned comparisons.
void nms(std::vector<BoxInfo>& input_boxes)
{
    const float nmsThreshold = 0.45f;
    std::sort(input_boxes.begin(), input_boxes.end(),
              [](const BoxInfo& a, const BoxInfo& b) { return a.score > b.score; });
    const size_t n = input_boxes.size();
    // Pre-compute each box area (+1 in each dimension, pixel-inclusive).
    std::vector<float> vArea(n);
    for (size_t i = 0; i < n; ++i)
    {
        vArea[i] = (input_boxes[i].x2 - input_boxes[i].x1 + 1) *
                   (input_boxes[i].y2 - input_boxes[i].y1 + 1);
    }
    std::vector<bool> isSuppressed(n, false);
    for (size_t i = 0; i < n; ++i)
    {
        if (isSuppressed[i]) { continue; }
        for (size_t j = i + 1; j < n; ++j)
        {
            if (isSuppressed[j]) { continue; }
            // Suppression only applies between boxes of the same class.
            if (input_boxes[i].label != input_boxes[j].label) { continue; }
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(0.0f, xx2 - xx1 + 1);
            float h = std::max(0.0f, yy2 - yy1 + 1);
            float inter = w * h;
            if (inter / (vArea[i] + vArea[j] - inter) >= nmsThreshold)
            {
                isSuppressed[j] = true;
            }
        }
    }
    // In-order compaction of the surviving boxes.
    std::vector<BoxInfo> kept;
    kept.reserve(n);
    for (size_t i = 0; i < n; ++i)
    {
        if (!isSuppressed[i]) { kept.push_back(input_boxes[i]); }
    }
    input_boxes.swap(kept);
}
// Run YOLOv5s inference (TVM/ROCm) on one frame and append the decoded,
// NMS-filtered detections (in original-image coordinates) to generate_boxes.
//
// Fixes vs. original: the module was reloaded and the graph executor rebuilt
// on every call (now cached in function-local statics); both DLTensors were
// leaked each frame (now freed); the ~4.4 MB input buffer lived on the stack
// (stack-overflow risk; now heap); the input was written with a raw host
// memcpy into a ROCm-device tensor (now TVMArrayCopyFromBytes).
void DeployGraphExecutor(cv::Mat &srcImage, std::vector<BoxInfo> &generate_boxes)
{
    constexpr int kInputSize = 608;      // network input resolution
    constexpr int kNumProposal = 22743;  // rows of the YOLO output tensor
    constexpr int kBoxClasses = 85;      // 4 box + 1 objectness + 80 classes
    // Load the compiled library once and reuse it for every frame.
    static DLDevice dev{kDLROCM, 0};
    static tvm::runtime::Module mod_factory =
        tvm::runtime::Module::LoadFromFile("lib/yolov5s_miopen_rocblas.so");
    static tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev);
    tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
    tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
    tvm::runtime::PackedFunc run = gmod.GetFunction("run");

    // Preprocess: resize and convert HWC uint8 -> CHW float in [0, 1].
    cv::Mat in_put;
    cv::resize(srcImage, in_put, cv::Size(kInputSize, kInputSize));
    std::vector<float> img_data(kInputSize * kInputSize * 3);
    Mat_to_CHW(img_data.data(), in_put);

    int dtype_code = kDLFloat, dtype_bits = 32, dtype_lanes = 1;
    int device_type = kDLROCM, device_id = 0;
    DLTensor* x = nullptr;
    int64_t shape[4] = {1, 3, kInputSize, kInputSize};
    TVMArrayAlloc(shape, 4, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
    // x->data lives on the ROCm device; use the TVM copy API for the
    // host->device transfer.
    TVMArrayCopyFromBytes(x, img_data.data(), img_data.size() * sizeof(float));

    DLTensor* y = nullptr;
    int64_t out_shape[3] = {1, kNumProposal, kBoxClasses};
    TVMArrayAlloc(out_shape, 3, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);

    set_input("images", x);
    run();
    get_output(0, y);

    std::vector<float> result(kNumProposal * kBoxClasses);
    TVMArrayCopyToBytes(y, result.data(), result.size() * sizeof(float));
    // Release the device tensors (the original leaked them every frame).
    TVMArrayFree(x);
    TVMArrayFree(y);

    const float* pdata = result.data();
    // Scale factors back to the original image size.
    float ratioh = (float)srcImage.rows / kInputSize;
    float ratiow = (float)srcImage.cols / kInputSize;
    float objThreshold = 0.5f, confThreshold = 0.5f;
    for (int i = 0; i < kNumProposal; i++)
    {
        int index = i * kBoxClasses;
        float obj_conf = pdata[index + 4];  // objectness score
        if (obj_conf <= objThreshold) continue;
        // Pick the best of the 80 class scores.
        int class_idx = 0;
        float max_class_socre = 0;
        for (int k = 0; k < 80; ++k)
        {
            if (pdata[k + index + 5] > max_class_socre)
            {
                max_class_socre = pdata[k + index + 5];
                class_idx = k;
            }
        }
        if (max_class_socre > confThreshold)
        {
            // (cx, cy, w, h) -> corner box, rescaled to the source image.
            float cx = pdata[index];
            float cy = pdata[index + 1];
            float w = pdata[index + 2];
            float h = pdata[index + 3];
            generate_boxes.push_back(BoxInfo{
                float((cx - 0.5 * w) * ratiow), float((cy - 0.5 * h) * ratioh),
                float((cx + 0.5 * w) * ratiow), float((cy + 0.5 * h) * ratioh),
                max_class_socre, class_idx});
        }
    }
    nms(generate_boxes);
}
int main()
{
Queue* queue = new Queue(1);
std::thread ThreadDecoder(DecoderThreadFunc, queue);
int frame_cnt = 0;
double start_time = getTickCount();
while (!queue->DecodeEnd)
{
cv::Mat srcImage;
queue->deQueue(&srcImage);
if (srcImage.empty()) {
continue;
}
std::vector<BoxInfo> generate_boxes;
double time1 = getTickCount();
DeployGraphExecutor(srcImage, generate_boxes);
double time2 = getTickCount();
double elapsedTime = (time2 - time1)*1000 / getTickFrequency();
fprintf(stdout, "inference time:%f ms\n", elapsedTime);
frame_cnt++;
// postprocess
fprintf(stdout,"////////////////Detection Results////////////////\n");
for(size_t i=0;i<generate_boxes.size();i++)
{
BoxInfo result = generate_boxes[i];
rectangle(srcImage, Point(result.x1, result.y1), Point(int(result.x2), int(result.y2)), Scalar(0, 0, 255), 2);
string score = format("score:%.2f", result.score);
string label = format("label:%d", result.label);
putText(srcImage, score, Point(result.x1, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
putText(srcImage, label, Point(result.x1 + 130, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
fprintf(stdout, "detector result[%d] box:%.1f %.1f %.1f %.1f, label:%d, confidence:%.2f\n", i,
result.x1, result.x2, result.y1, result.y2, result.label, result.score);
}
// save result
/*char out[20];
snprintf(out, sizeof(out), "out/Frame%d.jpg", frame_cnt);
imwrite(out, srcImage);*/
}
double end_time = getTickCount();
fprintf(stdout, "Finish ####### frame_cnt: %d, Inference fps: %.2f, all time: %.2f ms\n", frame_cnt, float(frame_cnt/((end_time - start_time)/getTickFrequency())), (end_time - start_time)/getTickFrequency()*1000);
ThreadDecoder.join();
delete queue;
queue = NULL;
return 0;
}
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <cstdio>
#include <fstream>
#include </usr/include/opencv2/opencv.hpp>
#include </usr/include/opencv2/highgui/highgui.hpp>
#include </usr/include/opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <typeinfo>
#include <algorithm>
#include <vector>
#include <algorithm>
#include <Decoder.h>
#include <Queuethread.h>
#include <thread>
using namespace cv;
using namespace std;
// Decoder thread entry point: demultiplexes and decodes the video stream set
// up by Decoder::DecoderInit(), converts each decoded YUV420p frame to RGB
// and pushes it into the shared queue for the inference loop.
// Completion/failure is signalled via que->DecodeEnd and que->finish().
static void DecoderThreadFunc(Queue* queue)
{
// end: 0 = still reading packets, 1 = input exhausted (decoder is being
// flushed), 2 = decoder fully drained.
int ret, end = 0;
int frame_cnt = 0;
Queue* que = queue;
Decoder decoder;
decoder.DecoderInit();
while(true)
{
// Negative return: no more packets in the container.
if (av_read_frame(decoder.fmt_ctx, decoder.pkt) < 0)
{
if(end == 2)
{
// Drain completed on an earlier iteration: notify the consumer and stop.
que->DecodeEnd = true;
break;
}
end = 1;
}
if (decoder.pkt->stream_index == decoder.video_stream_idx) {
if(!end) {
ret = avcodec_send_packet(decoder.video_dec_ctx, decoder.pkt);
} else {
// A NULL packet puts the codec into draining mode (flush buffered frames).
ret = avcodec_send_packet(decoder.video_dec_ctx, NULL);
}
if (ret < 0 && ret != AVERROR_EOF) {
fprintf(stderr, "Error submitting a packet for decoding\n");
que->DecodeEnd = true;
break;
}
// Pull every frame the decoder currently has ready.
while (ret >= 0 || end == 1)
{
ret = avcodec_receive_frame(decoder.video_dec_ctx, decoder.frame);
if (ret == AVERROR(EAGAIN)) {
// Decoder needs more input — go read the next packet.
break;
} else if (ret == AVERROR_EOF ) {
// All buffered frames delivered; the loop above will terminate next pass.
end = 2;
break;
} else if (ret < 0) {
av_log(NULL, AV_LOG_ERROR, "Error while receiving a frame from the decoder\n");
que->finish();
return;
}
decoder.frame->pts = decoder.frame->best_effort_timestamp;
frame_cnt++;
// Pack the Y, U and V planes into one height*3/2 x width buffer
// (YUV420p: U and V are each a quarter of the luma size), then convert.
// NOTE(review): assumes tightly packed planes (linesize == width) —
// confirm for the videos actually used.
cv::Mat srcImage = cv::Mat::zeros(decoder.frame->height*3/2, decoder.frame->width, CV_8UC1);
memcpy(srcImage.data, (unsigned char*)decoder.frame->data[0], decoder.frame->width * decoder.frame->height);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height, (unsigned char*)decoder.frame->data[1], decoder.frame->width * decoder.frame->height/4);
memcpy(srcImage.data + decoder.frame->width * decoder.frame->height*5/4, (unsigned char*)decoder.frame->data[2], decoder.frame->width * decoder.frame->height/4);
cvtColor(srcImage, srcImage, COLOR_YUV420p2RGB);
que->enQueue(srcImage);
av_frame_unref(decoder.frame);
}
}
av_packet_unref(decoder.pkt);
}
fprintf(stdout, "Decoder: ####### frame count: %d\n", frame_cnt);
que->finish();
}
// Convert an interleaved 8-bit 3-channel image (HWC) into planar CHW floats
// scaled to [0, 1].
//
// @param img_data destination buffer of at least frame.rows*frame.cols*3 floats
// @param frame    source image; assumed continuous and 3-channel (the caller
//                 produces it with cv::resize) — TODO confirm for other callers
void Mat_to_CHW(float *img_data, cv::Mat &frame)
{
    assert(img_data && !frame.empty());
    // Derive the plane size from the image itself instead of hard-coding the
    // network resolution, so the helper works for any input size.
    const unsigned int volChl = frame.rows * frame.cols;
    for (unsigned int c = 0; c < 3; ++c)
    {
        for (unsigned int j = 0; j < volChl; ++j)
            img_data[c * volChl + j] = static_cast<float>(frame.data[j * 3 + c] / 255.0);
    }
}
// Axis-aligned detection box in original-image coordinates.
typedef struct BoxInfo
{
    float x1;    // left
    float y1;    // top
    float x2;    // right
    float y2;    // bottom
    float score; // confidence (objectness * best class score upstream)
    int label;   // class index
} BoxInfo;

// Greedy per-class non-maximum suppression at IoU threshold 0.45.
// Sorts boxes by descending score and removes any box whose IoU with an
// already-kept box of the SAME label reaches the threshold. The surviving
// boxes remain sorted by score.
//
// Fixes vs. original: the compaction used std::remove_if with a stateful
// index-counting predicate, which depends on unspecified predicate
// application order; replaced with an explicit in-order copy. Loop indices
// are size_t to avoid signed/unsigned comparisons.
void nms(std::vector<BoxInfo>& input_boxes)
{
    const float nmsThreshold = 0.45f;
    std::sort(input_boxes.begin(), input_boxes.end(),
              [](const BoxInfo& a, const BoxInfo& b) { return a.score > b.score; });
    const size_t n = input_boxes.size();
    // Pre-compute each box area (+1 in each dimension, pixel-inclusive).
    std::vector<float> vArea(n);
    for (size_t i = 0; i < n; ++i)
    {
        vArea[i] = (input_boxes[i].x2 - input_boxes[i].x1 + 1) *
                   (input_boxes[i].y2 - input_boxes[i].y1 + 1);
    }
    std::vector<bool> isSuppressed(n, false);
    for (size_t i = 0; i < n; ++i)
    {
        if (isSuppressed[i]) { continue; }
        for (size_t j = i + 1; j < n; ++j)
        {
            if (isSuppressed[j]) { continue; }
            // Suppression only applies between boxes of the same class.
            if (input_boxes[i].label != input_boxes[j].label) { continue; }
            float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1);
            float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1);
            float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2);
            float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2);
            float w = std::max(0.0f, xx2 - xx1 + 1);
            float h = std::max(0.0f, yy2 - yy1 + 1);
            float inter = w * h;
            if (inter / (vArea[i] + vArea[j] - inter) >= nmsThreshold)
            {
                isSuppressed[j] = true;
            }
        }
    }
    // In-order compaction of the surviving boxes.
    std::vector<BoxInfo> kept;
    kept.reserve(n);
    for (size_t i = 0; i < n; ++i)
    {
        if (!isSuppressed[i]) { kept.push_back(input_boxes[i]); }
    }
    input_boxes.swap(kept);
}
// Run YOLOv7-tiny inference (TVM/ROCm) on one frame and append the decoded,
// NMS-filtered detections (in original-image coordinates) to generate_boxes.
//
// Fixes vs. original: the module was reloaded and the graph executor rebuilt
// on every call (now cached in function-local statics); both DLTensors were
// leaked each frame (now freed); the ~4.7 MB input buffer lived on the stack
// (stack-overflow risk; now heap); the input was written with a raw host
// memcpy into a ROCm-device tensor (now TVMArrayCopyFromBytes).
void DeployGraphExecutor(cv::Mat &srcImage, std::vector<BoxInfo> &generate_boxes)
{
    constexpr int kInputSize = 640;      // network input resolution
    constexpr int kNumProposal = 25200;  // rows of the YOLO output tensor
    constexpr int kBoxClasses = 85;      // 4 box + 1 objectness + 80 classes
    // Load the compiled library once and reuse it for every frame.
    static DLDevice dev{kDLROCM, 0};
    static tvm::runtime::Module mod_factory =
        tvm::runtime::Module::LoadFromFile("lib/yolov7t_miopen_rocblas.so");
    static tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev);
    tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
    tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
    tvm::runtime::PackedFunc run = gmod.GetFunction("run");

    // Preprocess: resize and convert HWC uint8 -> CHW float in [0, 1].
    cv::Mat in_put;
    cv::resize(srcImage, in_put, cv::Size(kInputSize, kInputSize));
    std::vector<float> img_data(kInputSize * kInputSize * 3);
    Mat_to_CHW(img_data.data(), in_put);

    int dtype_code = kDLFloat, dtype_bits = 32, dtype_lanes = 1;
    int device_type = kDLROCM, device_id = 0;
    DLTensor* x = nullptr;
    int64_t shape[4] = {1, 3, kInputSize, kInputSize};
    TVMArrayAlloc(shape, 4, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
    // x->data lives on the ROCm device; use the TVM copy API for the
    // host->device transfer.
    TVMArrayCopyFromBytes(x, img_data.data(), img_data.size() * sizeof(float));

    DLTensor* y = nullptr;
    int64_t out_shape[3] = {1, kNumProposal, kBoxClasses};
    TVMArrayAlloc(out_shape, 3, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);

    set_input("images", x);
    run();
    get_output(0, y);

    std::vector<float> result(kNumProposal * kBoxClasses);
    TVMArrayCopyToBytes(y, result.data(), result.size() * sizeof(float));
    // Release the device tensors (the original leaked them every frame).
    TVMArrayFree(x);
    TVMArrayFree(y);

    const float* pdata = result.data();
    // Scale factors back to the original image size.
    float ratioh = (float)srcImage.rows / kInputSize;
    float ratiow = (float)srcImage.cols / kInputSize;
    float objThreshold = 0.5f, confThreshold = 0.5f;
    for (int i = 0; i < kNumProposal; i++)
    {
        int index = i * kBoxClasses;
        float obj_conf = pdata[index + 4];  // objectness score
        if (obj_conf <= objThreshold) continue;
        // Pick the best of the 80 class scores.
        int class_idx = 0;
        float max_class_socre = 0;
        for (int k = 0; k < 80; ++k)
        {
            if (pdata[k + index + 5] > max_class_socre)
            {
                max_class_socre = pdata[k + index + 5];
                class_idx = k;
            }
        }
        if (max_class_socre > confThreshold)
        {
            // (cx, cy, w, h) -> corner box, rescaled to the source image.
            float cx = pdata[index];
            float cy = pdata[index + 1];
            float w = pdata[index + 2];
            float h = pdata[index + 3];
            generate_boxes.push_back(BoxInfo{
                float((cx - 0.5 * w) * ratiow), float((cy - 0.5 * h) * ratioh),
                float((cx + 0.5 * w) * ratiow), float((cy + 0.5 * h) * ratioh),
                max_class_socre, class_idx});
        }
    }
    nms(generate_boxes);
}
int main()
{
Queue* queue = new Queue(1);
std::thread ThreadDecoder(DecoderThreadFunc, queue);
int frame_cnt = 0;
double start_time = getTickCount();
while (!queue->DecodeEnd)
{
cv::Mat srcImage;
queue->deQueue(&srcImage);
if (srcImage.empty()) {
continue;
}
std::vector<BoxInfo> generate_boxes;
double time1 = getTickCount();
DeployGraphExecutor(srcImage, generate_boxes);
double time2 = getTickCount();
double elapsedTime = (time2 - time1)*1000 / getTickFrequency();
fprintf(stdout, "inference time:%f ms\n", elapsedTime);
frame_cnt++;
// postprocess
fprintf(stdout,"////////////////Detection Results////////////////\n");
for(size_t i=0;i<generate_boxes.size();i++)
{
BoxInfo result = generate_boxes[i];
rectangle(srcImage, Point(result.x1, result.y1), Point(int(result.x2), int(result.y2)), Scalar(0, 0, 255), 2);
string score = format("score:%.2f", result.score);
string label = format("label:%d", result.label);
putText(srcImage, score, Point(result.x1, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
putText(srcImage, label, Point(result.x1 + 130, result.y1 - 5), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);
fprintf(stdout, "detector result[%d] box:%.1f %.1f %.1f %.1f, label:%d, confidence:%.2f\n", i,
result.x1, result.x2, result.y1, result.y2, result.label, result.score);
}
// save result
/*char out[20];
snprintf(out, sizeof(out), "out/Frame%d.jpg", frame_cnt);
imwrite(out, srcImage);*/
}
double end_time = getTickCount();
fprintf(stdout, "Finish ####### frame_cnt: %d, Inference fps: %.2f, all time: %.2f ms\n", frame_cnt, float(frame_cnt/((end_time - start_time)/getTickFrequency())), (end_time - start_time)/getTickFrequency()*1000);
ThreadDecoder.join();
delete queue;
queue = NULL;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment