"vscode:/vscode.git/clone" did not exist on "d557e9f3b7929c1a781514ce8272fdeb1f8267b2"
Commit 3eef8b26 authored by zhanggezhong's avatar zhanggezhong
Browse files

mobilenet_v2_tvm code

parent 52936a19
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Makefile Example to deploy TVM modules.
TVM_ROOT=$(shell cd ../..; pwd)
DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
ROCM_ROOT=/opt/dtk
# pkg-config already emits the -I/-l flags, so use these variables verbatim.
OPENCV_INCLUDE = $(shell pkg-config --cflags opencv)
OPENCV_LIBS = $(shell pkg-config --libs opencv)
PKG_CFLAGS = -std=c++17 -O2 -fPIC\
	-I${TVM_ROOT}/include\
	-I${DMLC_CORE}/include\
	-I${ROCM_ROOT}/include\
	${OPENCV_INCLUDE}\
	-I${TVM_ROOT}/3rdparty/dlpack/include\
	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>
PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -pthread\
	-L${ROCM_ROOT}/lib -lamdhip64\
	${OPENCV_LIBS}\
	-L${ROCM_ROOT}/miopen/lib -lMIOpen\
	-L${ROCM_ROOT}/rocblas/lib -lrocblas
.PHONY: clean all

all: lib/libtvm_runtime_pack.o lib/MobileNet_V2_deploy

# Build rule for the all-in-one TVM runtime pack.
# (These are real file targets, so they must not be marked .PHONY;
# doing so would force a rebuild on every invocation.)
lib/libtvm_runtime_pack.o: tvm_runtime_pack.cc
	@mkdir -p $(@D)
	$(CXX) -c $(PKG_CFLAGS) -o $@ $^

# Deploy using the all-in-one TVM runtime pack.
lib/MobileNet_V2_deploy: MobileNet_V2_deploy.cc lib/libtvm_runtime_pack.o
	@mkdir -p $(@D)
	$(CXX) $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS)

clean:
	rm -rf lib
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
 * \brief Example code to load and run a TVM module.
 * \file MobileNet_V2_deploy.cc
 */
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <typeinfo>

// The Makefile adds the OpenCV include path via pkg-config, so use the
// normal header names rather than absolute /usr/include paths.
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
void Verify(tvm::runtime::Module mod, std::string fname) {
  // Get the function from the module.
  tvm::runtime::PackedFunc f = mod.GetFunction(fname);
  ICHECK(f != nullptr);
  // Allocate the DLPack data structures.
  //
  // Note that we use the TVM runtime API to allocate the DLTensor in this
  // example. TVM accepts DLPack-compatible DLTensors, so the function can be
  // invoked as long as we pass a correct pointer to a DLTensor array.
  //
  // For more information please refer to dlpack.
  // One thing to notice is that DLPack contains an alignment requirement for
  // the data pointer, and TVM takes advantage of that. If you plan to use your
  // own customized data container, please make sure the DLTensor you pass in
  // meets the alignment requirement.
  DLTensor* x;
  DLTensor* y;
  int ndim = 1;
  int dtype_code = kDLFloat;
  int dtype_bits = 32;
  int dtype_lanes = 1;
  int device_type = kDLCPU;
  int device_id = 0;
  int64_t shape[1] = {10};
  TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
  TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
  for (int i = 0; i < shape[0]; ++i) {
    static_cast<float*>(x->data)[i] = i;
  }
  // Invoke the function.
  // PackedFunc is a function that can be invoked via positional arguments.
  // The signature of the function is specified in tvm.build.
  f(x, y);
  // Check the output: "addone" adds 1.0f to every element.
  for (int i = 0; i < shape[0]; ++i) {
    ICHECK_EQ(static_cast<float*>(y->data)[i], i + 1.0f);
  }
  LOG(INFO) << "Finish verification...";
  TVMArrayFree(x);
  TVMArrayFree(y);
}
void DeploySingleOp() {
  // Normally we can directly load the compiled module from a dynamic library.
  tvm::runtime::Module mod_dylib = tvm::runtime::Module::LoadFromFile("lib/test_addone_dll.so");
  LOG(INFO) << "Verify dynamic loading from test_addone_dll.so";
  Verify(mod_dylib, "addone");
  // For libraries that are packed as a system lib and linked together with
  // the app, we can use GetSystemLib to get the system-wide library.
  LOG(INFO) << "Verify load function from system lib";
  tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("runtime.SystemLib"))();
  Verify(mod_syslib, "addonesys");
}
// Normalize an image with the ImageNet per-channel mean/std
// (RGB mean 0.485/0.456/0.406, std 0.229/0.224/0.225) and reorder BGR to RGB.
void PreProcess(const Mat& image, Mat& image_blob) {
  Mat input;
  image.copyTo(input);
  std::vector<Mat> channels, channel_p;
  split(input, channels);
  Mat R, G, B;
  B = channels.at(0);  // OpenCV loads images in BGR order.
  G = channels.at(1);
  R = channels.at(2);
  B = (B / 255. - 0.406) / 0.225;
  G = (G / 255. - 0.456) / 0.224;
  R = (R / 255. - 0.485) / 0.229;
  channel_p.push_back(R);
  channel_p.push_back(G);
  channel_p.push_back(B);
  Mat outt;
  merge(channel_p, outt);
  image_blob = outt;
}
// Convert an HWC uint8 Mat (224x224x3) into a CHW float array scaled to [0, 1].
void Mat_to_CHW(float* img_data, cv::Mat& frame) {
  assert(img_data && !frame.empty());
  unsigned int volChl = 224 * 224;
  for (int c = 0; c < 3; ++c) {
    for (unsigned j = 0; j < volChl; ++j)
      img_data[c * volChl + j] = static_cast<float>(frame.data[j * 3 + c]) / 255.0f;
  }
}
void DeployGraphExecutor() {
  LOG(INFO) << "Running graph executor...";
  // Load the compiled library.
  DLDevice dev{kDLROCM, 0};
  tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("lib/MobileNet_V2.so");
  // Create the graph executor module.
  using namespace std;
  tvm::runtime::Module gmod = mod_factory.GetFunction("default")(dev);
  tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = gmod.GetFunction("run");
  // Load and resize the test image.
  cv::Mat image = cv::imread("./cow.jpg");
  cv::Mat in_put;
  cv::Mat img_in;
  cv::resize(image, in_put, cv::Size(224, 224));
  // cv::cvtColor(in_put, in_put, cv::COLOR_BGR2RGB);
  float img_data[224 * 224 * 3];
  // PreProcess(in_put, img_in);
  // Mat_to_CHW(img_data, img_in);
  Mat_to_CHW(img_data, in_put);
  // Allocate the output tensor on the device.
  DLTensor* y;
  int out_ndim = 2;
  int64_t out_shape[2] = {1, 1000};
  int dtype_code = kDLFloat;
  int dtype_bits = 32;
  int dtype_lanes = 1;
  int device_type = kDLROCM;
  int device_id = 0;
  TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
  // Allocate the input tensor on the device.
  DLTensor* x;
  int ndim = 4;
  int64_t shape[4] = {1, 3, 224, 224};
  TVMArrayAlloc(shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
  // x->data lives in ROCm device memory, so copy through the TVM runtime API
  // rather than a raw host-side memcpy.
  TVMArrayCopyFromBytes(x, img_data, 3 * 224 * 224 * sizeof(float));
  // Set the input.
  set_input("data", x);
  // Run the model.
  run();
  // Get the output and copy it back to the host.
  get_output(0, y);
  float* result = new float[1000];
  TVMArrayCopyToBytes(y, result, 1000 * sizeof(float));
  float max_num = *max_element(result, result + 1000);
  auto max_iter = std::max_element(result, result + 1000);
  auto max_num_index = std::distance(result, max_iter);
  cout << "max_num:" << max_num << endl;
  cout << "max_iter:" << max_iter << endl;  // prints the address of the max element
  cout << "max_num_index:" << max_num_index << endl;
  delete[] result;
  TVMArrayFree(x);
  TVMArrayFree(y);
}
int main(void) {
  // DeploySingleOp();
  DeployGraphExecutor();
  return 0;
}
# TVM
## Model Introduction
```
MobileNet V2 is a lightweight convolutional neural network for image classification. It is built from
depthwise separable convolutions and inverted residual blocks with linear bottlenecks, which sharply cut
the parameter count and computation while retaining good accuracy, making it well suited to mobile and
embedded deployment. This example walks through using TVM to run inference with, auto-tune, and deploy a
trained MobileNet V2 model in ONNX format.
```
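As orientation, the minimal sketch below shows the flow this example follows (load the ONNX model, compile it with Relay for the ROCm backend, run one inference); the file name, input name, and target string mirror the scripts in this repository, but treat it as an illustration rather than a drop-in script:
```
import onnx
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# Load the trained ONNX model and convert it to a Relay module.
onnx_model = onnx.load("mobilenetv2-7.onnx")
mod, params = relay.frontend.from_onnx(onnx_model, {"data": (1, 3, 224, 224)}, dtype="float32")

# Compile for the DCU through the ROCm backend, offloading to MIOpen/rocBLAS.
target = "rocm -libs=miopen,rocblas"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

# Run one inference on a random input.
dev = tvm.device("rocm", 0)
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input("data", np.random.rand(1, 3, 224, 224).astype("float32"))
module.run()
out = module.get_output(0).numpy()  # shape (1, 1000)
```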
## Model Structure
```
MobileNet_V2
```
## Model File
Model file download URL:
```
"https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx"
```
## Dataset
Image data used by the Python inference and tuning code:
```
"https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
```
Label data:
```
"https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
```
Data used by the C++ deployment code:
```
"https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip"
```
## Inference, Auto-tuning, and Deployment
### Environment Setup
Pull the image:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:tvm-0.11_fp32_cpp_dtk22.10_py38_centos-7.6-latest
```
### Run Inference and Tuning
After downloading the model file, run the following command to test inference and auto-tuning:
```
python tune_resnet50-v2.py
```
### Single-card Deployment Inference Test
After pulling and setting up the image, enter /tvm-0.11-dev0/apps/ with cd, place the downloaded code in
that directory, then cd into tvm_tune_resnet50-v2/ and run the following commands:
```
mkdir -p lib
python prepare_test_libs.py
sh run_example.sh
```
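The last step runs the C++ binary built by the Makefile; run_example.sh also keeps a commented-out python_deploy.py step. A hedged Python equivalent of that deployment (assuming lib/MobileNet_V2.so was produced by prepare_test_libs.py, and with a random input standing in for the preprocessed image) would look like:
```
import numpy as np
import tvm
from tvm.contrib import graph_executor

# Load the shared library exported by prepare_test_libs.py.
lib = tvm.runtime.load_module("lib/MobileNet_V2.so")
dev = tvm.device("rocm", 0)
module = graph_executor.GraphModule(lib["default"](dev))

# Feed a preprocessed 1x3x224x224 image (random here), as the C++ code does.
module.set_input("data", np.random.rand(1, 3, 224, 224).astype("float32"))
module.run()
scores = module.get_output(0).numpy().squeeze()
print("max_num_index:", int(scores.argmax()))
```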
## Accuracy Data
```
max_num:15.6692
max_iter:0x28cda14
max_num_index:345
```
max_iter is the raw address of the maximum element; max_num_index 345 corresponds to the "ox" entry in synset.txt, consistent with the cow.jpg input.
## TVM Version
```
TVM-0.11
```
## Source Repository and Issue Feedback
* https://developer.hpccube.com/codes/modelzoo/tvm_tune_resnet50-v2
## References
* https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py
cow.jpg (278 KB, binary image added in this commit)
# Model name
modelName=MobileNet_V2_TVM
# Model description
modelDescription=MobileNet_V2 is a deep neural network model for image recognition
# Application scenarios (separate multiple tags with commas)
appScenario=CV
# Framework type (separate multiple tags with commas)
frameType=onnx
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Script to prepare test_addone.so"""
import tvm
import numpy as np
from tvm import te
from tvm import relay
import os
import onnx
img_data = np.random.rand(1,3,224,224).astype("float32")/255
input_name = "data"
shape_dict = {input_name: img_data.shape}
input_shape = img_data.shape
print("input shape",img_data.shape)
#mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
model_path = "/mobilenetv2-7.onnx"
onnx_model = onnx.load(model_path)
#target = "rocm -libs=miopen,rocblas"
np.random.seed(0)
dtype = "float32"
#img_data = np.random.rand(1,3,224,224).astype("float32")/255
def prepare_test_libs(base_path):
    # Unused helper kept from the upstream example; main() below only calls
    # prepare_graph_lib.
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
    # Compile the model as a dynamic library.
    fadd_dylib = relay.build(mod, target, params=params)
    dylib_path = os.path.join(base_path, "MobileNet_V2_addone_dll.so")
    fadd_dylib.export_library(dylib_path)
    # Save the compiled host module as an object file (a stand-in for the
    # original system-library packing, which needs a system-lib target).
    fadd_syslib = relay.build(mod, target, params=params, mod_name="MobileNet_V2")
    syslib_path = os.path.join(base_path, "MobileNet_V2_addone_sys.o")
    fadd_syslib.get_lib().save(syslib_path)
def prepare_graph_lib(base_path):
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
    # Build the module.
    with tvm.transform.PassContext(opt_level=3):
        compiled_lib = relay.build(mod, tvm.target.Target(target), params=params)
    # Export it as a shared library.
    # If you are running cross compilation, you can also consider exporting
    # to tar and invoking the host compiler later.
    dylib_path = os.path.join(base_path, "MobileNet_V2.so")
    compiled_lib.export_library(dylib_path)


if __name__ == "__main__":
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    # prepare_test_libs(os.path.join(curr_path, "lib"))
    prepare_graph_lib(os.path.join(curr_path, "lib"))
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
echo "Build the libraries.."
mkdir -p lib
make
echo "Run the example"
export LD_LIBRARY_PATH=../../build:${LD_LIBRARY_PATH}
export DYLD_LIBRARY_PATH=../../build:${DYLD_LIBRARY_PATH}
#echo "Run the deployment with all in one packed library..."
#lib/cpp_deploy_pack
#python prepare_test_libs.py
#echo "Run the cpp deployment with all in normal library..."
#lib/cpp_deploy_normal
echo "Run the cpp deployment with all in normal library..."
lib/MobileNet_V2_deploy
#echo "Run the python deployment with all in normal library..."
#python3 python_deploy.py
import cv2
import numpy as np
import onnx
from scipy.special import softmax

import tvm
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor
def get_network(name, batch_size, layout="NCHW", dtype="float32"):
    # The auto-scheduler prefers the NHWC layout; adjust the input
    # dimensions below to match your model.
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)
    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)
    if name == "MobileNet_V2":
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
        return mod, params, input_shape, output_shape
    raise ValueError("Unsupported network: " + name)


model_path = "mobilenetv2-7.onnx"
onnx_model = onnx.load(model_path)
np.random.seed(0)
def readimage(pathOfImage, GRAY=False, inputShape=[1, 3, 128, 128]):
    if GRAY:
        srcImage = cv2.imread(pathOfImage, cv2.IMREAD_GRAYSCALE)
        print("srcImage.shape:", srcImage.shape)
        resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
        resizedImage_Float = resizedImage.astype("float32")
        srcImage_CHW = resizedImage_Float[None]
    else:
        srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)  # numpy array, HWC layout
        # Resize and convert to CHW.
        resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
        resizedImage_Float = resizedImage.astype("float32")  # convert to float32
        srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))  # convert to CHW
    # Normalize with the ImageNet per-channel mean/std.
    # Note: cv2 loads images in BGR order while these constants are in RGB
    # order; reverse the channels first if strict ImageNet preprocessing is needed.
    mean_vec = np.array([0.485, 0.456, 0.406])
    stddev_vec = np.array([0.229, 0.224, 0.225])
    inputData = np.zeros(inputShape).astype("float32")  # NCHW
    for i in range(srcImage_CHW.shape[0]):
        inputData[0, i, :, :] = (srcImage_CHW[i, :, :] / 255 - mean_vec[i]) / stddev_vec[i]
    # Replicate the first image across the rest of the batch.
    for i in range(inputData.shape[0]):
        if i != 0:
            inputData[i, :, :, :] = inputData[0, :, :, :]
    return inputData
# Download the image data, then convert it to a numpy array to use as an input to the model.
# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
img_path = "kitten.jpg"
# img_path = download_testdata(img_url, "imagenet_cat.png", module="data")
network = "MobileNet_V2"
dtype = "float32"
target = "rocm -libs=miopen,rocblas"
input_name = "data"
input_shape = [1, 3, 224, 224]
img_data = readimage(img_path, GRAY=False, inputShape=input_shape)
batch_size = 1
layout = "NCHW"
shape_dict = {input_name: img_data.shape}
input_shape = img_data.shape
print("input shape", img_data.shape)
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)

print("Compile...")
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)
print("Compile succeeded!")
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()

# Download a list of labels.
# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt"
labels_path = "synset.txt"
# labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
    labels = [l.rstrip() for l in f]

# Read the output tensor and report the top-1 class.
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
print("class=%s ; probability=%f" % (labels[ranks[0]], scores[ranks[0]]))

# Evaluate
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=100, min_repeat_ms=500))

log_file = "%s-%s-B%d.json" % (network, layout, batch_size)
print("log_file name is {}".format(log_file))
print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)
# Begin tuning.
def run_tuning():
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=2000,  # change this to 20000 to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)


run_tuning()
# Compile with the best schedules found during tuning.
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=params)
print("Compile succeeded!")

labels_path = "synset.txt"
# labels_path = download_testdata(labels_url, "synset.txt", module="data")
with open(labels_path, "r") as f:
    labels = [l.rstrip() for l in f]
dtype = "float32"

# Rebuild the graph executor from the tuned library before rerunning.
module = graph_executor.GraphModule(lib["default"](dev))
module.set_input(input_name, img_data)
module.run()
output_shape = (1, 1000)
tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy()

# Read the output tensor and report the top-1 class.
scores = softmax(tvm_output)
scores = np.squeeze(scores)
ranks = np.argsort(scores)[::-1]
print("class=%s ; probability=%f" % (labels[ranks[0]], scores[ranks[0]]))

# Evaluate
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=100, min_repeat_ms=500))
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
 * \brief This is an all-in-one TVM runtime file.
 *
 * You only have to use this file to compile libtvm_runtime to
 * include in your project.
 *
 * - Copy this file into your project which depends on the tvm runtime.
 * - Compile with -std=c++17.
 * - Add the following include paths:
 *   - /path/to/tvm/include/
 *   - /path/to/tvm/3rdparty/dmlc-core/include/
 *   - /path/to/tvm/3rdparty/dlpack/include/
 * - Add -lpthread -ldl to the linked libraries.
 * - You are good to go.
 * - See the Makefile in the same folder for an example.
 *
 * The include files here are presented with relative paths.
 * You need to remember to change them to point to the right files.
 */
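// As a concrete illustration of the steps above (a hedged sketch only:
// substitute the paths of your own TVM checkout, and see the Makefile in this
// folder for the full ROCm/MIOpen/rocBLAS flags actually used by this example):
//
//   g++ -std=c++17 -O2 -fPIC \
//       -I/path/to/tvm/include \
//       -I/path/to/tvm/3rdparty/dmlc-core/include \
//       -I/path/to/tvm/3rdparty/dlpack/include \
//       -c tvm_runtime_pack.cc -o libtvm_runtime_pack.o
//
// and then add -lpthread -ldl when linking the final binary.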
#define TVM_USE_LIBBACKTRACE 0
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/container.cc"
#include "../../src/runtime/cpu_device_api.cc"
#include "../../src/runtime/file_utils.cc"
#include "../../src/runtime/library_module.cc"
#include "../../src/runtime/logging.cc"
#include "../../src/runtime/module.cc"
#include "../../src/runtime/ndarray.cc"
#include "../../src/runtime/object.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/thread_pool.cc"
#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/workspace_pool.cc"
// NOTE: all the files after this are optional modules
// that you can include or remove, depending on which features you use.
//
// Likely we only need to enable one of the following:
// - If you use Module::Load, use dso_library.cc
// - For a system packed library, use system_library.cc
#include "../../src/runtime/dso_library.cc"
#include "../../src/runtime/system_library.cc"
// Graph executor
#include "../../src/runtime/graph_executor/graph_executor.cc"
#include "../../src/runtime/graph_executor/graph_executor_factory.cc"
// Uncomment the following lines to enable RPC
// #include "../../src/runtime/rpc/rpc_session.cc"
// #include "../../src/runtime/rpc/rpc_event_impl.cc"
// #include "../../src/runtime/rpc/rpc_server_env.cc"
// These macros enable the corresponding device APIs when uncommented.
#define TVM_ROCM_RUNTIME 1
#define TVM_USE_MIOPEN 1
#define TVM_USE_ROCBLAS 1
#define __HIP_PLATFORM_HCC__ 1
// #define TVM_METAL_RUNTIME 1
// #define TVM_OPENCL_RUNTIME 1
// Uncomment the following lines to enable Metal
// #include "../../src/runtime/metal/metal_device_api.mm"
// #include "../../src/runtime/metal/metal_module.mm"
// Uncomment the following lines to enable CUDA
// #include "../../src/runtime/cuda/cuda_device_api.cc"
// #include "../../src/runtime/cuda/cuda_module.cc"
// The following lines enable ROCm (with MIOpen and rocBLAS)
#include "../../src/runtime/rocm/rocm_device_api.cc"
#include "../../src/runtime/rocm/rocm_module.cc"
#include "../../src/runtime/contrib/miopen/conv_forward.cc"
#include "../../src/runtime/contrib/miopen/miopen_utils.cc"
#include "../../src/runtime/contrib/rocblas/rocblas.cc"
// Uncomment the following lines to enable OpenCL
// #include "../../src/runtime/opencl/opencl_device_api.cc"
// #include "../../src/runtime/opencl/opencl_module.cc"