Unverified Commit 61c258fe authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Add benchmark - add source code of cudnn function micro benchmark (#78)

* Benchmarks: Add benchmark - add source code of cudnn function micro benchmark
parent 5e9f948d
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
project(CudnnBenchmark LANGUAGES CUDA CXX)
include(../cuda_common.cmake)
SET(SRC "cudnn_helper.cpp" CACHE STRING "source file")
SET(TARGET_NAME "cudnn_function" CACHE STRING "target name")
find_package(CUDAToolkit REQUIRED)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${NVCC_ARCHS_SUPPORTED}")
# Directory-level include/link setup must precede the targets that need it;
# previously these came after add_library, so the library never inherited them.
include_directories(${CUDAToolkit_INCLUDE_DIRS})
link_directories(${CUDAToolkit_LIBRARY_DIR} ${CUDAToolkit_TARGET_DIR})
find_library(CUDNN_LIBRARY cudnn
    HINTS ${CUDAToolkit_ROOT_DIR}
    PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
# Fetch nlohmann/json used for parsing the benchmark config.
include(FetchContent)
FetchContent_Declare(json
    GIT_REPOSITORY https://github.com/ArthurSonzogni/nlohmann_json_cmake_fetchcontent
    GIT_TAG v3.7.3)
FetchContent_GetProperties(json)
if(NOT json_POPULATED)
    FetchContent_Populate(json)
    add_subdirectory(${json_SOURCE_DIR} ${json_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
add_library(${TARGET_NAME} SHARED ${SRC})
# Link the shared library against its direct dependencies so it does not carry
# undefined cudart/cudnn symbols.
target_link_libraries(${TARGET_NAME} CUDA::cudart ${CUDNN_LIBRARY})
add_executable(CudnnBenchmark cudnn_test.cpp)
target_link_libraries(CudnnBenchmark ${TARGET_NAME} nlohmann_json::nlohmann_json CUDA::cudart ${CUDNN_LIBRARY})
install(TARGETS CudnnBenchmark ${TARGET_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include "cudnn_function.h"
namespace cudnn_test {
/**
 * @brief Cudnn benchmark class for cudnnConvolutionBackwardData
 * @tparam T1 input data type
 * @tparam T2 convolution compute type
 */
template <typename T1, typename T2> class ConvolutionBackwardDataFunction : public CudnnFunction<T1, T2> {
    cudnnConvolutionBwdDataAlgo_t bwd_data_algo_; ///< algorithm used to compute the data gradient
    /**
     * @brief Execute the kernel/function
     *
     * NOTE(review): x/x_desc_ are passed in the dy slot and h/h_desc_ in the dx slot of
     * cudnnConvolutionBackwardData; this pairing matches get_workspace_size() below, but
     * confirm the configured input/output dims match cudnn's expected gradient shapes.
     */
    virtual void kernel_entry() override {
        CHECK_CUDNN_ERROR(cudnnConvolutionBackwardData(
            this->cudnn_handle, &this->alpha_, this->w_desc_.desc(), this->filter, this->x_desc_.desc(), this->x,
            this->conv_desc_.desc(), this->bwd_data_algo_, this->fwd_workspace_, this->fwd_workspace_size_,
            &this->beta_, this->h_desc_.desc(), this->h));
    }
    /**
     * @brief Get and set convolution algorithm and workspace size used in cudnn convolution functions
     */
    virtual void get_workspace_size() override {
        // The configured integer algo_ is interpreted as a backward-data algorithm enum.
        bwd_data_algo_ = cudnnConvolutionBwdDataAlgo_t(this->algo_);
        CHECK_CUDNN_ERROR(cudnnGetConvolutionBackwardDataWorkspaceSize(
            this->cudnn_handle, this->w_desc_.desc(), this->x_desc_.desc(), this->conv_desc_.desc(),
            this->h_desc_.desc(), this->bwd_data_algo_, &this->fwd_workspace_size_));
    }

  public:
    /**
     * @brief Construct a new Convolution Backward Data Function object
     */
    ConvolutionBackwardDataFunction() {}
    /**
     * @brief Construct a new Convolution Backward Data Function object
     * @param config base class CudnnConfig object
     */
    ConvolutionBackwardDataFunction(CudnnConfig &config) : CudnnFunction<T1, T2>(config) {}
};
} // namespace cudnn_test
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include "cudnn_function.h"
namespace cudnn_test {
/**
 * @brief Cudnn benchmark class for cudnnConvolutionBackwardFilter
 * @tparam T1 input data type
 * @tparam T2 convolution compute type
 */
template <typename T1, typename T2> class ConvolutionBackwardFilterFunction : public CudnnFunction<T1, T2> {
    cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_; ///< algorithm used to compute the filter gradient
    /**
     * @brief Execute the kernel/function
     *
     * NOTE(review): h/h_desc_ are passed in the dy slot and filter/w_desc_ receive the filter
     * gradient; this pairing matches get_workspace_size() below — confirm dims against cudnn docs.
     */
    virtual void kernel_entry() override {
        CHECK_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
            this->cudnn_handle, &this->alpha_, this->x_desc_.desc(), this->x, this->h_desc_.desc(), this->h,
            this->conv_desc_.desc(), this->bwd_filter_algo_, this->fwd_workspace_, this->fwd_workspace_size_,
            &this->beta_, this->w_desc_.desc(), this->filter));
    }
    /**
     * @brief Get and set convolution algorithm and workspace size used in cudnn convolution functions
     */
    virtual void get_workspace_size() override {
        // The configured integer algo_ is interpreted as a backward-filter algorithm enum.
        bwd_filter_algo_ = cudnnConvolutionBwdFilterAlgo_t(this->algo_);
        CHECK_CUDNN_ERROR(cudnnGetConvolutionBackwardFilterWorkspaceSize(
            this->cudnn_handle, this->x_desc_.desc(), this->h_desc_.desc(), this->conv_desc_.desc(),
            this->w_desc_.desc(), this->bwd_filter_algo_, &this->fwd_workspace_size_));
    }

  public:
    /**
     * @brief Construct a new Convolution Backward Filter Function object
     */
    ConvolutionBackwardFilterFunction() {}
    /**
     * @brief Construct a new Convolution Backward Filter Function object
     * @param config base class CudnnConfig object
     */
    ConvolutionBackwardFilterFunction(CudnnConfig &config) : CudnnFunction<T1, T2>(config) {}
};
} // namespace cudnn_test
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include "cudnn_function.h"
namespace cudnn_test {
/**
 * @brief Cudnn benchmark class for cudnnConvolutionForward
 * @tparam T1 input data type
 * @tparam T2 convolution compute type
 */
template <typename T1, typename T2> class ConvolutionForwardFunction : public CudnnFunction<T1, T2> {
    cudnnConvolutionFwdAlgo_t fwd_algo_; ///< algorithm used to compute the forward convolution
    /**
     * @brief Execute the kernel/function
     */
    virtual void kernel_entry() override {
        CHECK_CUDNN_ERROR(cudnnConvolutionForward(this->cudnn_handle, &this->alpha_, this->x_desc_.desc(), this->x,
                                                  this->w_desc_.desc(), this->filter, this->conv_desc_.desc(),
                                                  this->fwd_algo_, this->fwd_workspace_, this->fwd_workspace_size_,
                                                  &this->beta_, this->h_desc_.desc(), this->h));
    }
    /**
     * @brief Get and set convolution algorithm and workspace size used in cudnn convolution functions
     */
    virtual void get_workspace_size() override {
        // The configured integer algo_ is interpreted as a forward algorithm enum.
        fwd_algo_ = cudnnConvolutionFwdAlgo_t(this->algo_);
        CHECK_CUDNN_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
            this->cudnn_handle, this->x_desc_.desc(), this->w_desc_.desc(), this->conv_desc_.desc(),
            this->h_desc_.desc(), this->fwd_algo_, &this->fwd_workspace_size_));
    }

  public:
    /**
     * @brief Construct a new Convolution Forward Function object
     */
    ConvolutionForwardFunction() {}
    /**
     * @brief Construct a new Convolution Forward Function object
     * @param config base class CudnnConfig object
     */
    ConvolutionForwardFunction(CudnnConfig &config) : CudnnFunction<T1, T2>(config) {}
};
} // namespace cudnn_test
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <unordered_map>
#include "cudnn_helper.h"
namespace cudnn_test {
/**
 * @brief Enum of the supported cudnn function names
 */
enum cudnn_function_name_enum {
    e_cudnnConvolutionForward,
    e_cudnnConvolutionBackwardData,
    e_cudnnConvolutionBackwardFilter,
};
/**
 * @brief Map from cudnn function name string to cudnn function name enum;
 * used by CudnnConfig::name2enum() to validate and translate the "name" config field
 */
static std::unordered_map<std::string, cudnn_function_name_enum> const cudnn_function_name_string = {
    {"cudnnConvolutionForward", cudnn_function_name_enum::e_cudnnConvolutionForward},
    {"cudnnConvolutionBackwardData", cudnn_function_name_enum::e_cudnnConvolutionBackwardData},
    {"cudnnConvolutionBackwardFilter", cudnn_function_name_enum::e_cudnnConvolutionBackwardFilter},
};
/**
 * @brief Class to store the configuration of cudnn function params
 */
class CudnnConfig {
  protected:
    int num_test;                    ///< the number of steps used to test and measure
    int warm_up;                     ///< the number of steps used to warm up
    int num_in_step;                 ///< the number of functions invoking in a step
    int random_seed;                 ///< the random seed used to generate random data
    std::string name;                ///< the name of the cudnn function
    cudnn_function_name_enum e_name; ///< enum cudnn function name, set by name2enum()
    std::vector<int> input_dims_;    ///< size of the input tensor for every dimension
    std::vector<int> input_stride_;  ///< stride of the input tensor for every dimension
    std::vector<int> filter_dims_;   ///< size of the filter tensor for every dimension
    std::vector<int> output_dims_;   ///< size of the output tensor for every dimension
    std::vector<int> output_stride_; ///< stride of the output tensor for every dimension
    int algo_;          ///< enumerant that specifies which convolution algorithm should be used
    int array_length_;  ///< dimension of the convolution
    std::vector<int> padA_;           ///< zero-padding size for each convolution dimension
    std::vector<int> filter_strideA_; ///< filter stride for each convolution dimension
    std::vector<int> dilationA_;      ///< dilation factor for each convolution dimension
    cudnnConvolutionMode_t mode_;     ///< selects between CUDNN_CONVOLUTION and CUDNN_CROSS_CORRELATION
    bool use_tensor_op_;              ///< whether the use of tensor op is permitted in the library routines
                                      ///< associated with a given convolution descriptor
    cudnnDataType_t input_type_;      ///< data type in which the computation will be done
    cudnnDataType_t conv_type_;       ///< data type in which the convolution will be done
    std::string function_str_;        ///< the str representing the cudnn function with params

  public:
    void set_num_test(int num_test) { this->num_test = num_test; }
    void set_warm_up(int warm_up) { this->warm_up = warm_up; }
    void set_num_in_step(int num_in_step) { this->num_in_step = num_in_step; }
    void set_random_seed(int random_seed) { this->random_seed = random_seed; }
    void set_name(const std::string &n) { name = n; }
    void set_input_dims(const std::vector<int> &input_dims) { input_dims_ = input_dims; }
    void set_input_stride(const std::vector<int> &input_stride) { input_stride_ = input_stride; }
    void set_filter_dims(const std::vector<int> &filter_dims) { filter_dims_ = filter_dims; }
    void set_output_dims(const std::vector<int> &output_dims) { output_dims_ = output_dims; }
    void set_output_stride(const std::vector<int> &output_stride) { output_stride_ = output_stride; }
    void set_algo(int algo) { algo_ = algo; }
    void set_array_length(int array_length) { array_length_ = array_length; }
    void set_padA(const std::vector<int> &padA) { padA_ = padA; }
    void set_filter_strideA(const std::vector<int> &filter_strideA) { filter_strideA_ = filter_strideA; }
    void set_dilationA(const std::vector<int> &dilationA) { dilationA_ = dilationA; }
    void set_mode(const cudnnConvolutionMode_t &mode) { mode_ = mode; }
    void set_use_tensor_op(bool use_tensor_op) { use_tensor_op_ = use_tensor_op; }
    void set_input_type(const cudnnDataType_t &input_type) { input_type_ = input_type; }
    // BUGFIX: previously assigned to input_type_, silently discarding the conv type.
    void set_conv_type(const cudnnDataType_t &conv_type) { conv_type_ = conv_type; }
    void set_function(const std::string &str) { function_str_ = str; }
    std::vector<int> &get_input_dims() { return input_dims_; }
    std::vector<int> &get_input_stride() { return input_stride_; }
    std::vector<int> &get_filter_dims() { return filter_dims_; }
    std::vector<int> &get_output_dims() { return output_dims_; }
    std::vector<int> &get_output_stride() { return output_stride_; }
    int get_algo() { return algo_; }
    int get_array_length() { return array_length_; }
    std::vector<int> &get_padA() { return padA_; }
    std::vector<int> &get_filter_strideA() { return filter_strideA_; }
    std::vector<int> &get_dilationA() { return dilationA_; }
    cudnnConvolutionMode_t &get_mode() { return mode_; }
    bool get_use_tensor_op() { return use_tensor_op_; }
    cudnnDataType_t &get_input_type() { return input_type_; }
    // BUGFIX: previously returned input_type_, so half/half configs were dispatched wrongly.
    cudnnDataType_t &get_conv_type() { return conv_type_; }
    std::string &get_name() { return name; }
    cudnn_function_name_enum get_e_name() { return e_name; }
    std::string &get_function_str() { return function_str_; }
    /**
     * @brief Convert name string to enum name
     * @return cudnn_function_name_enum the enum value matching `name`
     */
    cudnn_function_name_enum name2enum() {
        auto it = cudnn_function_name_string.find(this->name);
        if (it != cudnn_function_name_string.end()) {
            this->e_name = it->second;
            return e_name;
        } else {
            // NOTE(review): throws a const char*, which catch(std::exception&) callers
            // will not catch — consider std::runtime_error.
            throw "ERROR: invalid input function name";
        }
    }
};
} // namespace cudnn_test
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <chrono>
#include <iomanip>
#include <tuple>
#include "cudnn_config.h"
namespace cudnn_test {
/**
* @brief Generation of cudnn functions' params and run the benchmark of this function
*
* @tparam T1 the type of TensorDescriptor
* @tparam T2 the type of ConvolutionDescriptor
*/
template <typename T1, typename T2> class CudnnFunction : public CudnnConfig {
protected:
cudnnHandle_t cudnn_handle;
TensorDescriptorNd<T1> x_desc_;
FilterDescriptorNd<T1> w_desc_;
ConvolutionDescriptor<T2> conv_desc_;
TensorDescriptorNd<T1> h_desc_;
size_t fwd_workspace_size_;
float *fwd_workspace_;
T1 *x, *filter, *h;
const float alpha_ = 1.f;
const float beta_ = 0.f;
/**
* @brief Malloc cuda memory and fill in value for data params used in the cudnn function
*/
void prepare_input();
/**
* @brief Generate some params used in the cudnn function
*/
void prepare_for_function();
/**
* @brief Get and set convolution algorithm and workspace size used in cudnn convolution functions
*/
virtual void get_workspace_size() {}
/**
* @brief launch the kernel/function
*/
virtual void kernel_entry() {}
public:
/**
* @brief Construct a new Cudnn Function object according to a CudnnConfig object, including initialization for
* cudnn handle and curand
*
* @param config a CudnnConfig object including configuration of the params to the cudnn function
*/
CudnnFunction(CudnnConfig &config) : CudnnConfig(config) {
// Init cudnn handle and device
cudnn_handle_init(&this->cudnn_handle);
}
/**
* @brief Destroy the Cudnn Function object, including free cuda memory and handle of cudnn and curand
*/
virtual ~CudnnFunction() {
// free context and memory
CUDA_SAFE_CALL(cudaFree(x));
CUDA_SAFE_CALL(cudaFree(filter));
CUDA_SAFE_CALL(cudaFree(h));
cudnn_handle_free(&this->cudnn_handle);
}
/**
* @brief The main procedure for cudnn function test, including warmup, function test and time measurement
*/
void benchmark();
};
/**
 * @brief Build the descriptors, math mode, algorithm and workspace needed by the cudnn call
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::prepare_for_function() {
    // Create convolution, input, filter and output descriptors from the config.
    conv_desc_ =
        ConvolutionDescriptor<T2>(get_array_length(), get_padA(), get_filter_strideA(), get_dilationA(), get_mode());
    x_desc_ = TensorDescriptorNd<T1>(get_input_dims(), get_input_stride());
    w_desc_ = FilterDescriptorNd<T1>(get_filter_dims());
    h_desc_ = TensorDescriptorNd<T1>(get_output_dims(), get_output_stride());
    // Pick tensor-op math when the config permits it, default math otherwise.
    const cudnnMathType_t math_type = get_use_tensor_op() ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
    CHECK_CUDNN_ERROR(cudnnSetConvolutionMathType(conv_desc_.desc(), math_type));
    // Let the derived class select its algorithm and report the workspace requirement.
    this->get_workspace_size();
    // Allocate and zero the workspace buffer (size expressed in whole floats).
    zeros<float>(&fwd_workspace_, std::vector<int>{static_cast<int>(this->fwd_workspace_size_ / sizeof(float)), 1});
}
/**
 * @brief Malloc cuda memory and fill in value for data params used in the cudnn function
 *
 * All three device buffers are filled with pseudo-random data via rand<T1>();
 * the same seed is passed for each buffer.
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::prepare_input() {
    // Allocate memory for filter data
    rand<T1>(&filter, get_filter_dims(), random_seed);
    // Allocate memory for input data
    rand<T1>(&x, get_input_dims(), random_seed);
    // Allocate memory for output data
    rand<T1>(&h, get_output_dims(), random_seed);
}
/**
 * @brief The main procedure for cudnn function test, including warmup, function test and time measurement
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::benchmark() {
    // Prepare descriptors, algorithm and workspace for function running
    prepare_for_function();
    // Allocate memory and fill with data of input and output tensor
    prepare_input();
    // Warm up (not timed)
    for (int i = 0; i < warm_up; ++i) {
        for (int j = 0; j < num_in_step; j++) {
            kernel_entry();
        }
    }
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    // Per-step average durations in milliseconds per function invocation
    std::vector<float> iteration_time;
    iteration_time.reserve(num_test);
    // Benchmark in range of steps
    for (int i = 0; i < num_test; i++) {
        // Collect time within each step, including #num_in_step times function invoking
        auto start = std::chrono::high_resolution_clock::now();
        for (int j = 0; j < num_in_step; j++) {
            kernel_entry();
        }
        // Synchronize so the measured interval covers device work, not just kernel launches
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();
        // Convert step time to single-invocation duration
        float step_avg =
            static_cast<float>(std::chrono::duration<double, std::milli>(end - start).count() / num_in_step);
        iteration_time.emplace_back(step_avg);
    }
    // Output results
    std::cout << "[function config]: " << this->get_function_str() << std::endl;
    std::cout << "[raw_data]: ";
    // Range-for avoids the previous signed/unsigned comparison against size()
    for (const auto &t : iteration_time) {
        std::cout << t << ",";
    }
    std::cout << std::endl;
}
} // namespace cudnn_test
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.
/**
* @brief Helper for parsing command line arguments and pass params to CudnnConfig
*/
#pragma once
#include <algorithm>
#include <ctime>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>
#include "convolution_backward_data.h"
#include "convolution_backward_filter.h"
#include "convolution_forward.h"
using json = nlohmann::json;
namespace cudnn_test {
/**
 * @brief Utility for storing and parsing command line arguments
 */
class Options {
    char **begin; ///< pointer to the first argv entry
    char **end;   ///< pointer one past the last argv entry
    /**
     * @brief Get the char* value of the cmd line argument
     * @param option the argument name in cmd
     * @return char* the token following the option, or nullptr when absent
     */
    char *get_cmd_option(const std::string &option) {
        char **itr = std::find(begin, end, option);
        if (itr != end && ++itr != end) {
            return *itr;
        }
        // nullptr instead of the literal 0 for pointer clarity
        return nullptr;
    }
    /**
     * @brief Get the int type value of cmd line argument
     * @param option the cmd line argument
     * @return int the parsed value, or 0 when the option is absent
     *         (note: std::stoi throws on non-numeric input)
     */
    int get_cmd_line_argument_int(const std::string &option) {
        if (char *value = get_cmd_option(option)) {
            return std::stoi(value);
        }
        return 0;
    }
    /**
     * @brief Get the string type value of cmd line argument
     * @param option the cmd line argument
     * @return std::string the value of the option, or "" when absent
     */
    std::string get_cmd_line_argument_string(const std::string &option) {
        if (char *value = get_cmd_option(option)) {
            return std::string(value);
        }
        return "";
    }

  public:
    int num_test;               ///< number of measured steps (default 1)
    int warm_up;                ///< number of warm-up steps (default 1)
    int num_in_step;            ///< function invocations per step (default 100)
    int random_seed;            ///< data-generation seed (default: current time)
    std::string para_info_json; ///< json string describing the function under test
    /**
     * @brief Construct a new Command Line object, applying defaults for missing options
     * @param argc argument count from main()
     * @param argv argument vector from main()
     */
    Options(int argc, char *argv[]) {
        begin = argv;
        end = argv + argc;
        num_test = get_cmd_line_argument_int("--num_test");
        num_test = (num_test == 0 ? 1 : num_test);
        warm_up = get_cmd_line_argument_int("--warm_up");
        warm_up = (warm_up == 0 ? 1 : warm_up);
        num_in_step = get_cmd_line_argument_int("--num_in_step");
        num_in_step = (num_in_step == 0 ? 100 : num_in_step);
        random_seed = get_cmd_line_argument_int("--random_seed");
        random_seed = (random_seed == 0 ? time(NULL) : random_seed);
        para_info_json = get_cmd_line_argument_string("--config_json");
        para_info_json =
            para_info_json == ""
                ? R"({"algo":0,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],"filterDims":[32,128,3,3],"inputDims":[32,128,14,14],"inputStride":[25088,196,14,1],"inputType":0,"mode":1, "name":"cudnnConvolutionBackwardFilter","outputDims":[32,32,14,14],"outputStride":[6272,196,14,1],"padA":[1,1],"tensorOp":false})"
                : para_info_json;
    }
};
/**
 * @brief Helper function to convert from json to CudnnConfig
 *
 * @param j json including the params of a cudnn function read from 'config_path'
 * @param fn a CudnnConfig object populated from the json fields
 */
void from_json(const json &j, cudnn_test::CudnnConfig &fn) {
    // Keep a human-readable copy of the raw config (quotes replaced by spaces) for logging.
    auto str = j.dump();
    std::replace(str.begin(), str.end(), '\"', ' ');
    fn.set_function(str);
    // Required fields — j.at() throws if any of them is missing.
    fn.set_name(j.at("name").get<std::string>());
    fn.set_input_dims(j.at("inputDims").get<std::vector<int>>());
    fn.set_output_dims(j.at("outputDims").get<std::vector<int>>());
    fn.set_filter_dims(j.at("filterDims").get<std::vector<int>>());
    fn.set_input_type(j.at("inputType").get<cudnnDataType_t>());
    fn.set_conv_type(j.at("convType").get<cudnnDataType_t>());
    fn.set_array_length(j.at("arrayLength").get<int>());
    fn.set_input_stride(j.at("inputStride").get<std::vector<int>>());
    fn.set_output_stride(j.at("outputStride").get<std::vector<int>>());
    fn.set_algo(j.at("algo").get<int>());
    fn.set_padA(j.at("padA").get<std::vector<int>>());
    fn.set_filter_strideA(j.at("filterStrideA").get<std::vector<int>>());
    fn.set_dilationA(j.at("dilationA").get<std::vector<int>>());
    fn.set_mode(j.at("mode").get<cudnnConvolutionMode_t>());
    fn.set_use_tensor_op(j.at("tensorOp").get<bool>());
    // Validate the name and cache its enum form.
    fn.name2enum();
}
/**
 * @brief Get the cudnn function pointer of a specific child class object
 * @param function base class object of a cudnnFunction, used to initialize the base part of the child class
 * object
 * @return CudnnFunction<T1, T2>* a base-class pointer to a newly allocated child object;
 *         caller owns the pointer and must delete it (or wrap it in a smart pointer)
 */
template <typename T1, typename T2> CudnnFunction<T1, T2> *get_cudnn_function_pointer(CudnnConfig &function) {
    switch (function.get_e_name()) {
    case e_cudnnConvolutionForward:
        return new ConvolutionForwardFunction<T1, T2>(function);
    case e_cudnnConvolutionBackwardData:
        return new ConvolutionBackwardDataFunction<T1, T2>(function);
    case e_cudnnConvolutionBackwardFilter:
        return new ConvolutionBackwardFilterFunction<T1, T2>(function);
    default:
        // std::exception-derived so callers' catch(std::exception&) blocks actually
        // catch it; the previous char* throw bypassed them and terminated.
        throw std::runtime_error("invalid function name");
    }
}
/**
 * @brief run the entire process of benchmark according to cmd arguments
 *
 * first, parse the para_info_json string describing the cudnn function,
 * then dispatch on (input type, conv type) to instantiate the right function template,
 * finally run the benchmark of the function
 *
 * @param options the cmd arguments of the application
 */
void run_benchmark(Options &options) {
    try {
        json function_config = json::parse(options.para_info_json);
        // convert function params from json to CudnnConfig class
        cudnn_test::CudnnConfig function = function_config.get<cudnn_test::CudnnConfig>();
        function.set_num_test(options.num_test);
        function.set_warm_up(options.warm_up);
        function.set_num_in_step(options.num_in_step);
        function.set_random_seed(options.random_seed);
        const cudnnDataType_t input_type = function.get_input_type();
        const cudnnDataType_t conv_type = function.get_conv_type();
        // unique_ptr guarantees the function object is released even if benchmark() throws;
        // the previous raw new/delete leaked on exception.
        if (input_type == CUDNN_DATA_FLOAT && conv_type == CUDNN_DATA_FLOAT) {
            std::unique_ptr<CudnnFunction<float, float>> p_function(get_cudnn_function_pointer<float, float>(function));
            p_function->benchmark();
        } else if (input_type == CUDNN_DATA_HALF && conv_type == CUDNN_DATA_FLOAT) {
            std::unique_ptr<CudnnFunction<half, float>> p_function(get_cudnn_function_pointer<half, float>(function));
            p_function->benchmark();
        } else if (input_type == CUDNN_DATA_HALF && conv_type == CUDNN_DATA_HALF) {
            std::unique_ptr<CudnnFunction<half, half>> p_function(get_cudnn_function_pointer<half, half>(function));
            p_function->benchmark();
        } else {
            // std::exception-derived so the catch below reports it instead of terminating
            throw std::runtime_error("invalid input and conv type");
        }
    } catch (const std::exception &e) {
        std::cout << "Error: " << e.what() << std::endl;
    }
}
} // namespace cudnn_test
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
/**
* @brief Cpp file for some functions related to cudnn
*/
#include <cstdlib>
#include <numeric>
#include "cudnn_function.h"
namespace cudnn_test {
/**
 * @brief Check a cudnn call's status and throw a descriptive std::runtime_error on failure
 * @param result status code returned by the cudnn call
 * @param func textual form of the call site (supplied by CHECK_CUDNN_ERROR)
 * @param file source file of the call site
 * @param line source line of the call site
 */
void throw_cudnn_err(cudnnStatus_t result, const char *func, const char *file, int const line) {
    if (result == CUDNN_STATUS_SUCCESS) {
        return;
    }
    std::stringstream message;
    message << func << " failed with error"
            << "\nfile: " << file << "\nline: " << line << "\nmsg: " << cudnnGetErrorString(result);
    throw std::runtime_error(message.str());
}
/**
 * @brief Check a cuda runtime call's status and throw a descriptive std::runtime_error on failure
 * @param result status code returned by the cuda runtime call
 * @param func textual form of the call site (supplied by CUDA_SAFE_CALL)
 * @param file source file of the call site
 * @param line source line of the call site
 */
void check_cuda(cudaError_t result, const char *func, const char *file, int const line) {
    if (result == cudaSuccess) {
        return;
    }
    std::stringstream message;
    message << func << " failed with error"
            << "\nfile: " << file << "\nline: " << line << "\nmsg: " << cudaGetErrorString(result);
    throw std::runtime_error(message.str());
}
/**
 * @brief Cuda context init: reset the device, select device 0 and create a cudnn handle
 * @param cudnn_handle receives the created handle
 */
void cudnn_handle_init(cudnnHandle_t *cudnn_handle) {
    CUDA_SAFE_CALL(cudaDeviceReset());
    CUDA_SAFE_CALL(cudaSetDevice(0));
    // create streams/handles
    CHECK_CUDNN_ERROR(cudnnCreate(cudnn_handle));
}
/**
 * @brief Cuda context free: destroy the cudnn handle created by cudnn_handle_init
 * @param cudnn_handle the handle to destroy
 */
void cudnn_handle_free(cudnnHandle_t *cudnn_handle) { CHECK_CUDNN_ERROR(cudnnDestroy(*cudnn_handle)); }
/**
 * @brief Malloc cuda memory and fill in rand value
 * @tparam T element type; only the float and half specializations below are supported
 * @param input the pointer of input (receives the device pointer)
 * @param dims_ the shape of input
 * @param random_seed the random seed to generate random data
 */
template <typename T> void rand(T **input, std::vector<int> dims_, int random_seed) {
    // NOTE(review): throws a const char*, which catch(std::exception&) callers will not
    // catch — confirm this hard-failure behavior is intended for unsupported types.
    throw "unsupported rand data type";
}
/**
 * @brief float specialization: allocate a device buffer and fill it with uniform [0,1) values
 */
template <> void rand(float **input, std::vector<int> dims_, int random_seed) {
    const int element_count = std::accumulate(dims_.begin(), dims_.end(), 1, std::multiplies<int>());
    const size_t num_bytes = sizeof(float) * element_count;
    CUDA_SAFE_CALL(cudaMalloc((void **)input, num_bytes));
    // Stage the random values in pinned host memory for a fast host-to-device copy.
    float *host_buffer;
    CUDA_SAFE_CALL(cudaMallocHost(&host_buffer, num_bytes));
    srand(random_seed);
    for (int i = 0; i < element_count; i++) {
        host_buffer[i] = (float)std::rand() / (float)(RAND_MAX);
    }
    // copy input data from host to device, then release the staging buffer
    CUDA_SAFE_CALL(cudaMemcpy(*input, host_buffer, num_bytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaFreeHost(host_buffer));
}
/**
 * @brief half specialization: allocate a device buffer and fill it with uniform [0,1) values
 */
template <> void rand(half **input, std::vector<int> dims_, int random_seed) {
    int size = std::accumulate(dims_.begin(), dims_.end(), 1, std::multiplies<int>());
    CUDA_SAFE_CALL(cudaMalloc((void **)input, sizeof(half) * size));
    half *host_input;
    CUDA_SAFE_CALL(cudaMallocHost(&host_input, sizeof(half) * size));
    // Seed the generator so data is reproducible (matches the float specialization,
    // which previously seeded while this overload did not).
    srand(random_seed);
    for (int i = 0; i < size; i++) {
        host_input[i] = __float2half((float)std::rand() / (float)(RAND_MAX));
    }
    // BUGFIX: dst/src were swapped — cudaMemcpy(dst, src, ...) with cudaMemcpyHostToDevice
    // must copy from host_input into the device buffer *input, not the other way around.
    CUDA_SAFE_CALL(cudaMemcpy(*input, host_input, sizeof(half) * size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaFreeHost(host_input));
}
} // namespace cudnn_test
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
/**
* @brief Header file for some functions related to cudnn
*/
#pragma once
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <curand.h>
#include <curand_kernel.h>
namespace cudnn_test {
/**
 * @brief Check cudnn call status; throws std::runtime_error with call text and location on failure
 */
void throw_cudnn_err(cudnnStatus_t result, const char *func, const char *file, int const line);
// Wraps a cudnn call, capturing its source text and location for the error message
#define CHECK_CUDNN_ERROR(x) throw_cudnn_err((x), #x, __FILE__, __LINE__)
/**
 * @brief Check cuda runtime call status; throws std::runtime_error with call text and location on failure
 */
void check_cuda(cudaError_t result, const char *func, const char *file, int const line);
// Wraps a cuda runtime call, capturing its source text and location for the error message
#define CUDA_SAFE_CALL(x) check_cuda((x), #x, __FILE__, __LINE__)
/**
 * @brief Cuda context init: resets the device, selects device 0 and creates a cudnn handle
 */
void cudnn_handle_init(cudnnHandle_t *cudnn_handle);
/**
 * @brief Cuda context free: destroys the handle created by cudnn_handle_init
 */
void cudnn_handle_free(cudnnHandle_t *cudnn_handle);
/**
 * @brief Malloc cuda memory and fill in rand value
 * @tparam T element type (float and half specializations are defined in cudnn_helper.cpp)
 * @param input the pointer of input (receives the device pointer)
 * @param dims_ the shape of input
 * @param random_seed the random seed to generate random data
 */
template <typename T> void rand(T **input, std::vector<int> dims_, int random_seed);
/**
 * @brief Malloc cuda memory and fill in zero
 * @tparam T element type of the buffer
 * @param input the pointer of input (receives the device pointer)
 * @param dims_ the shape of input
 */
// NOTE(review): std::accumulate/std::multiplies need <numeric>/<functional>, which this
// header does not include directly — relies on transitive includes; confirm.
template <typename T> void zeros(T **input, std::vector<int> dims_) {
    int size = std::accumulate(dims_.begin(), dims_.end(), 1, std::multiplies<int>());
    CUDA_SAFE_CALL(cudaMalloc((void **)input, sizeof(T) * size));
    CUDA_SAFE_CALL(cudaMemset((void *)*input, 0, sizeof(T) * size));
}
/**
 * @brief Get cudnn tensor format for element type T
 * @tparam T element type
 * @param tensor_format receives the selected cudnnTensorFormat_t
 */
template <typename T> void get_tensor_format(cudnnTensorFormat_t &tensor_format) {
    // int8 inference only supports NHWC; every other type uses NCHW.
    tensor_format = std::is_same<T, uint8_t>::value ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
}
/**
 * @brief Get cudnn tensor data type for element type T
 * @tparam T one of float, half or (cudnn >= 6) uint8_t; anything else throws
 * @param type receives the matching cudnnDataType_t
 */
template <typename T> void get_tensor_type(cudnnDataType_t &type) {
    if (std::is_same<T, float>::value) {
        type = CUDNN_DATA_FLOAT;
    } else if (std::is_same<T, half>::value) {
        type = CUDNN_DATA_HALF;
    }
// int8 support is only available from cudnn 6 onward, per this version guard
#if CUDNN_MAJOR >= 6
    else if (std::is_same<T, uint8_t>::value)
        type = CUDNN_DATA_INT8;
#endif
    else
        throw("unknown type in tensorDescriptor");
}
/**
 * @brief RAII wrapper for TensorDescriptorNd
 * @tparam T element type, mapped to a cudnnDataType_t via get_tensor_type
 *
 * The descriptor lives in a shared_ptr with a custom deleter, so copies share
 * ownership and the underlying cudnn descriptor is destroyed exactly once.
 */
template <typename T> class TensorDescriptorNd {
    std::shared_ptr<cudnnTensorDescriptor_t> desc_;
    // Deleter invoked when the last owner goes away.
    struct TensorDescriptorNdDeleter {
        void operator()(cudnnTensorDescriptor_t *desc) {
            CHECK_CUDNN_ERROR(cudnnDestroyTensorDescriptor(*desc));
            delete desc;
        }
    };

  public:
    // Default-constructed wrapper holds no descriptor; desc() must not be called on it.
    TensorDescriptorNd() {}
    TensorDescriptorNd(const std::vector<int> &dim, const std::vector<int> &stride)
        : desc_(new cudnnTensorDescriptor_t, TensorDescriptorNdDeleter()) {
        cudnnDataType_t type;
        get_tensor_type<T>(type);
        CHECK_CUDNN_ERROR(cudnnCreateTensorDescriptor(desc_.get()));
        CHECK_CUDNN_ERROR(cudnnSetTensorNdDescriptor(*desc_, type, dim.size(), dim.data(), stride.data()));
    }
    cudnnTensorDescriptor_t desc() const { return *desc_; }
};
/**
 * @brief RAII wrapper for FilterDescriptorNd
 * @tparam T element type, mapped to cudnn tensor format and data type
 *
 * The descriptor lives in a shared_ptr with a custom deleter, so copies share
 * ownership and the underlying cudnn descriptor is destroyed exactly once.
 */
template <typename T> class FilterDescriptorNd {
    std::shared_ptr<cudnnFilterDescriptor_t> desc_;
    // Deleter invoked when the last owner goes away.
    struct FilterDescriptorNdDeleter {
        void operator()(cudnnFilterDescriptor_t *desc) {
            CHECK_CUDNN_ERROR(cudnnDestroyFilterDescriptor(*desc));
            delete desc;
        }
    };

  public:
    // Default-constructed wrapper holds no descriptor; desc() must not be called on it.
    FilterDescriptorNd() {}
    FilterDescriptorNd(const std::vector<int> &dim) : desc_(new cudnnFilterDescriptor_t, FilterDescriptorNdDeleter()) {
        cudnnTensorFormat_t tensor_format;
        get_tensor_format<T>(tensor_format);
        cudnnDataType_t type;
        get_tensor_type<T>(type);
        CHECK_CUDNN_ERROR(cudnnCreateFilterDescriptor(desc_.get()));
        // dim.data() instead of &dim[0], consistent with the other descriptor wrappers
        CHECK_CUDNN_ERROR(cudnnSetFilterNdDescriptor(*desc_, type, tensor_format, dim.size(), dim.data()));
    }
    // const accessor, consistent with TensorDescriptorNd/ConvolutionDescriptor
    cudnnFilterDescriptor_t desc() const { return *desc_; }
};
/**
 * @brief RAII wrapper for ConvolutionDescriptor
 * @tparam T convolution compute type, mapped to a cudnnDataType_t via get_tensor_type
 *
 * The descriptor lives in a shared_ptr with a custom deleter, so copies share
 * ownership and the underlying cudnn descriptor is destroyed exactly once.
 */
template <typename T> class ConvolutionDescriptor {
    std::shared_ptr<cudnnConvolutionDescriptor_t> desc_;
    // Deleter invoked when the last owner goes away.
    struct ConvolutionDescriptorDeleter {
        void operator()(cudnnConvolutionDescriptor_t *desc) {
            CHECK_CUDNN_ERROR(cudnnDestroyConvolutionDescriptor(*desc));
            delete desc;
        }
    };

  public:
    // Default-constructed wrapper holds no descriptor; desc() must not be called on it.
    ConvolutionDescriptor() {}
    ConvolutionDescriptor(int array_length, const std::vector<int> &padA, const std::vector<int> &filter_strideA,
                          const std::vector<int> &dilationA, cudnnConvolutionMode_t mode)
        : desc_(new cudnnConvolutionDescriptor_t, ConvolutionDescriptorDeleter()) {
        cudnnDataType_t type;
        get_tensor_type<T>(type);
        CHECK_CUDNN_ERROR(cudnnCreateConvolutionDescriptor(desc_.get()));
        CHECK_CUDNN_ERROR(cudnnSetConvolutionNdDescriptor(*desc_, array_length, padA.data(), filter_strideA.data(),
                                                          dilationA.data(), mode, type));
    }
    cudnnConvolutionDescriptor_t desc() const { return *desc_; };
};
} // namespace cudnn_test
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
/**
* @brief Cudnn function benchmark will read the params from cmd, and use these params
* to benchmark the wall time of the cudnn functions.
*/
#include <limits>
#include <stdexcept>
#include "cudnn_function_helper.h"
/**
* @brief Main function and entry of cudnn benchmark
* @details
* params list:
* num_test: test step nums
* warm_up: warm up step nums
* num_in_step: times each step will invoke the function
* random_seed: the random seed to generate data
* config_json: the json string including the params of the function
* functions supported:
* cudnnConvolutionForward
* cudnnConvolutionBackwardData
* cudnnConvolutionBackwardFilter
* @param argc
* @param argv
* @return int
*/
int main(int argc, char *argv[]) {
try {
// parse arguments from cmd
cudnn_test::Options options(argc, argv);
// benchmark the function
cudnn_test::run_benchmark(options);
} catch (std::exception &e) {
std::cout << "Error: " << e.what() << std::endl;
exit(-1);
}
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment