cudnn_function.h 5.14 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <chrono>
#include <iomanip>
#include <tuple>

#include "cudnn_config.h"

namespace cudnn_test {
/**
 * @brief Generation of cudnn functions' params and run the benchmark of this function
 *
 * @tparam T1 the type of TensorDescriptor
 * @tparam T2 the type of ConvolutionDescriptor
 */
template <typename T1, typename T2> class CudnnFunction : public CudnnConfig {
  protected:
    cudnnHandle_t cudnn_handle;
    TensorDescriptorNd<T1> x_desc_;
    FilterDescriptorNd<T1> w_desc_;
    ConvolutionDescriptor<T2> conv_desc_;
    TensorDescriptorNd<T1> h_desc_;
    size_t fwd_workspace_size_;
    float *fwd_workspace_;
    T1 *x, *filter, *h;
    const float alpha_ = 1.f;
    const float beta_ = 0.f;

    /**
     * @brief Malloc cuda memory and fill in value for data params used in the cudnn function
     */
    void prepare_input();
    /**
     * @brief Generate some params used in the cudnn function
     */
    void prepare_for_function();
    /**
     * @brief Get and set convolution algorithm and workspace size used in cudnn convolution functions
     */
    virtual void get_workspace_size() {}
    /**
     * @brief launch the kernel/function
     */
    virtual void kernel_entry() {}

  public:
    /**
     * @brief Construct a new Cudnn Function object according to a CudnnConfig object, including initialization for
     * cudnn handle and curand
     *
     * @param config a CudnnConfig object including configuration of the params to the cudnn function
     */
    CudnnFunction(CudnnConfig &config) : CudnnConfig(config) {
        // Init cudnn handle and device
        cudnn_handle_init(&this->cudnn_handle);
    }
    /**
     * @brief Destroy the Cudnn Function object, including free cuda memory and handle of cudnn and curand
     */
    virtual ~CudnnFunction() {
        // free context and memory
        CUDA_SAFE_CALL(cudaFree(x));
        CUDA_SAFE_CALL(cudaFree(filter));
        CUDA_SAFE_CALL(cudaFree(h));
        cudnn_handle_free(&this->cudnn_handle);
    }
    /**
     * @brief The main procedure for cudnn function test, including warmup, function test and time measurement
     */
    void benchmark();
};

/**
 * @brief Generate some params used in the cudnn function
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::prepare_for_function() {
    // Generate descriptor
    conv_desc_ =
        ConvolutionDescriptor<T2>(get_array_length(), get_padA(), get_filter_strideA(), get_dilationA(), get_mode());
    x_desc_ = TensorDescriptorNd<T1>(get_input_dims(), get_input_stride());
    w_desc_ = FilterDescriptorNd<T1>(get_filter_dims());
    h_desc_ = TensorDescriptorNd<T1>(get_output_dims(), get_output_stride());

    // Set Convolution MathType
    cudnnMathType_t algo = get_use_tensor_op() ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
    CHECK_CUDNN_ERROR(cudnnSetConvolutionMathType(conv_desc_.desc(), algo));
    // Set convolution algorithm and workspace size
    this->get_workspace_size();
    zeros<float>(&fwd_workspace_, std::vector<int>{static_cast<int>(this->fwd_workspace_size_ / sizeof(float)), 1});
}
/**
 * @brief Malloc cuda memory and fill in value for data params used in the cudnn function
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::prepare_input() {
    // Allocate memory for filter data
    rand<T1>(&filter, get_filter_dims(), random_seed);
    // Allocate memory for input data
    rand<T1>(&x, get_input_dims(), random_seed);
    // Allocate memory for output data
    rand<T1>(&h, get_output_dims(), random_seed);
}
/**
 * @brief The main procedure for cudnn function test, including warmup, function test and time measurement
 */
template <typename T1, typename T2> void CudnnFunction<T1, T2>::benchmark() {
    // Prepare some Prerequisites for function running
    prepare_for_function();
    // Allocate memory and fill with data of input and output tensor
    prepare_input();

    // Warm up
    for (int i = 0; i < warm_up; ++i) {
        for (int j = 0; j < num_in_step; j++) {
            kernel_entry();
        }
    }
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Prepare some varibles for time measurement
    std::vector<float> iteration_time;
    // Benchmark in range of steps
    for (int i_ = 0; i_ < num_test; i_++) {
        // Collect time within each step, including #num_in_step times function invoking
        auto start = std::chrono::high_resolution_clock::now();
        for (int j = 0; j < num_in_step; j++) {
            kernel_entry();
        }
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        auto end = std::chrono::high_resolution_clock::now();

        // Convert step time to single function duration and update min and max duration
        float i = static_cast<float>(std::chrono::duration<double, std::milli>(end - start).count() / num_in_step);
        iteration_time.emplace_back(i);
    }

    // Output results
    std::cout << "[function config]: " << this->get_function_str() << std::endl;
    std::cout << "[raw_data]: ";
    for (int i = 0; i < iteration_time.size(); i++) {
        std::cout << iteration_time[i] << ",";
    }
    std::cout << std::endl;
}
} // namespace cudnn_test