profile_gemm_impl.hpp 6.71 KB
Newer Older
1
2
#pragma once
#include "device_gemm_instance.hpp"
ltqin's avatar
ltqin committed
3
#include "device_gemm_xdl_instance.hpp"
4
5
6
7
8
9
10
11
12
13

namespace ck {
namespace profiler {

template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
Chao Liu's avatar
Chao Liu committed
14
15
16
17
18
19
20
21
22
23
void profile_gemm_impl(int do_verification,
                       int init_method,
                       bool do_log,
                       int nrepeat,
                       int M,
                       int N,
                       int K,
                       int StrideA,
                       int StrideB,
                       int StrideC)
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
{
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

ltqin's avatar
ltqin committed
48
    std::size_t num_thread = std::thread::hardware_concurrency();
49
50
51
52
    switch(init_method)
    {
    case 0: break;
    case 1:
ltqin's avatar
ltqin committed
53
54
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
55
56
        break;
    default:
ltqin's avatar
ltqin committed
57
58
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
59
    }
ltqin's avatar
ltqin committed
60
61
    // set zero to c_device_buf
    c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
62
63
64

    if(do_verification)
    {
Chao Liu's avatar
Chao Liu committed
65
66
67
68
69
70
        host_gemm_mk_kn_mn(a_m_k,
                           b_k_n,
                           c_m_n_host_result,
                           ck::tensor_operation::element_wise::PassThrough{},
                           ck::tensor_operation::element_wise::PassThrough{},
                           ck::tensor_operation::element_wise::PassThrough{});
71
72
73
74
75
76
77
78
79
80
81
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    c_device_buf.ToDevice(c_m_n_device_result.mData.data());

    // add device GEMM instances
Chao Liu's avatar
Chao Liu committed
82
    std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmNoOpPtr> gemm_ptrs;
83
84
85
86
87
88
89
90
91
92

    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_instance<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
            gemm_ptrs);

    if(gemm_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device GEMM instance found");
    }

Chao Liu's avatar
Chao Liu committed
93
    std::string best_gemm_name;
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
    {
        auto argument_ptr =
            gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                          static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                                          static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
                                          M,
                                          N,
                                          K,
                                          StrideA,
                                          StrideB,
Chao Liu's avatar
Chao Liu committed
110
111
112
113
                                          StrideC,
                                          ck::tensor_operation::element_wise::PassThrough{},
                                          ck::tensor_operation::element_wise::PassThrough{},
                                          ck::tensor_operation::element_wise::PassThrough{});
114
115
116
117
118

        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
Chao Liu's avatar
Chao Liu committed
119
120
            std::string gemm_name = gemm_ptr->GetTypeString();

121
122
123
            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

            std::size_t flop = std::size_t(2) * M * N * K;
Chao Liu's avatar
Chao Liu committed
124

125
126
127
128
129
130
131
132
            std::size_t num_btype =
                sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + sizeof(CDataType) * M * N;

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
Chao Liu's avatar
Chao Liu committed
133
                      << " GB/s, " << gemm_name << std::endl;
134
135
136

            if(tflops > best_tflops)
            {
Chao Liu's avatar
Chao Liu committed
137
                best_gemm_name  = gemm_name;
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }

            if(do_verification)
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());

                check_error(c_m_n_host_result, c_m_n_device_result);

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
                        << std::endl;
                }
            }
        }
        else
        {
            std::cout << "this device GEMM instance does not support this GEMM problem"
                      << std::endl;
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
Chao Liu's avatar
Chao Liu committed
168
              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
169
170
171
172
}

} // namespace profiler
} // namespace ck