profile_batched_gemm_reduce_impl.hpp 16.4 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

4
5
#pragma once

Chao Liu's avatar
Chao Liu committed
6
7
8
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
9
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
Chao Liu's avatar
Chao Liu committed
10
11
12
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
13
14
15
16
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
Chao Liu's avatar
Chao Liu committed
17
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
18
19
20
21

namespace ck {
namespace tensor_operation {
namespace device {
22
namespace instance {
23

24
25
26
27
28
29
30
31
32
33
using F32                 = float;
using F16                 = ck::half_t;
using ReducePtrsGlobal    = ck::Tuple<F32*, F32*>;
using Identity            = ck::tensor_operation::element_wise::PassThrough;
using Square              = ck::tensor_operation::element_wise::UnarySquare;
using ReduceInElementOps  = ck::Tuple<Identity, Square>;
using ReduceOutElementOps = ck::Tuple<Identity, Identity>;

using DeviceGemmReduceNoOpPtr =
    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
34
35

void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
36
    std::vector<DeviceGemmReduceNoOpPtr>&);
37
38

void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
39
    std::vector<DeviceGemmReduceNoOpPtr>&);
40
41

void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
42
    std::vector<DeviceGemmReduceNoOpPtr>&);
43
44

void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
45
    std::vector<DeviceGemmReduceNoOpPtr>&);
46

47
} // namespace instance
48
49
50
51
52
53
54
55
56
57
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace ck {
namespace profiler {

template <typename ADataType,
          typename BDataType,
          typename CDataType,
58
          typename ReduceDataType,
59
60
61
62
63
64
          typename ALayout,
          typename BLayout,
          typename CLayout>
bool profile_batched_gemm_reduce_impl(int do_verification,
                                      int init_method,
                                      bool do_log,
JD's avatar
JD committed
65
                                      bool time_kernel,
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
                                      int M,
                                      int N,
                                      int K,
                                      int StrideA,
                                      int StrideB,
                                      int StrideC,
                                      int BatchCount)
{
    bool pass = true;

    auto f_host_tensor_descriptor = [](std::size_t batch_count,
                                       std::size_t row,
                                       std::size_t col,
                                       std::size_t stride,
                                       auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                        std::vector<std::size_t>({row * stride, stride, 1}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                        std::vector<std::size_t>({col * stride, 1, stride}));
        }
    };

    Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{}));

    Tensor<CDataType> c_g_m_n_host_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
98
    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
99
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
100
    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
101
102
103
104
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

    Tensor<CDataType> c_g_m_n_device_result(
        f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
105
    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
106
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
107
    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
        {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
    std::cout << "d0_g_m: " << d0_g_m_host_result.mDesc << std::endl;
    std::cout << "d1_g_m: " << d1_g_m_host_result.mDesc << std::endl;

    std::size_t num_thread = std::thread::hardware_concurrency();
    switch(init_method)
    {
    case 0: break;
    case 1:
        std::srand(0);
        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
        b_g_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
        break;
    default:
        std::srand(0);
        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
        b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }

131
132
133
    using AElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using BElementOp            = ck::tensor_operation::element_wise::PassThrough;
    using CElementOp            = ck::tensor_operation::element_wise::PassThrough;
134
135
    using ReduceOp0             = ck::reduce::Add;
    using ReduceOp1             = ck::reduce::Add;
136
137
    using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
    using UnarySquareElementOp  = ck::tensor_operation::element_wise::UnarySquare;
rocking5566's avatar
rocking5566 committed
138

139
140
141
142
143
144
145
146
147
148
149
150
    auto a_element_op                     = AElementOp{};
    auto b_element_op                     = BElementOp{};
    auto c_element_op                     = CElementOp{};
    std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};

    const auto reduce0_op = ReduceOp0{};
    const auto reduce1_op = ReduceOp1{};

    auto passthrough                            = UnaryIdenticElementOp{};
    auto square                                 = UnarySquareElementOp{};
    std::array<void*, 2> reduce_in_element_ops  = {&passthrough, &square};
    std::array<void*, 2> reduce_out_element_ops = {&passthrough, &passthrough};
151
152
153
154
155
156
157

    if(do_verification)
    {
        using ReferenceBatchedGemmInstance =
            ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                             BDataType,
                                                             CDataType,
Anthony Chang's avatar
Anthony Chang committed
158
                                                             float,
159
160
161
162
                                                             AElementOp,
                                                             BElementOp,
                                                             CElementOp>;

163
164
        using ReduceAccDataType = ReduceDataType;

165
166
167
168
169
170
171
172
173
174
175
176
        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();

        auto ref_argument = ref_batched_gemm.MakeArgument(
            a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op);

        ref_invoker.Run(ref_argument);

        for(int batch = 0; batch < BatchCount; ++batch)
        {
            for(int m = 0; m < M; ++m)
            {
177
178
                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
                auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
179
180
181

                for(int n = 0; n < N; ++n)
                {
182
183
184
                    ReduceAccDataType d0_val =
                        ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
                    ReduceAccDataType d1_val;
185

186
187
188
                    square(d1_val, d0_val);
                    reduce0_op(reduce0_acc, d0_val);
                    reduce1_op(reduce1_acc, d1_val);
189
190
                }

191
192
                d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
                d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
193
194
195
196
            }
        }
    }

197
198
199
    DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize());
200
    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
201
                                 d0_g_m_device_result.mDesc.GetElementSpaceSize());
202
    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
203
                                 d1_g_m_device_result.mDesc.GetElementSpaceSize());
204

205
206
    std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                      reduce1_device_buf.GetDeviceBuffer()};
rocking5566's avatar
rocking5566 committed
207

208
209
210
211
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());

    // add device GEMM instances
212
    std::vector<ck::tensor_operation::device::instance::DeviceGemmReduceNoOpPtr> gemm_ptrs;
213
214
215
216
217
218
219
220

    if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
                 is_same<CDataType, half_t>::value)
    {
        if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                     is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
221
            ck::tensor_operation::device::instance::
222
223
224
225
226
227
228
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
229
            ck::tensor_operation::device::instance::
230
231
232
233
234
235
236
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
237
            ck::tensor_operation::device::instance::
238
239
240
241
242
243
244
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
                    gemm_ptrs);
        }
        else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
                          is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
        {
245
            ck::tensor_operation::device::instance::
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
                add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
                    gemm_ptrs);
        }
    }

    if(gemm_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device GEMM instance found");
    }

    std::string best_gemm_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
    {
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
                                                          b_device_buf.GetDeviceBuffer(),
                                                          nullptr,
                                                          {},
                                                          c_device_buf.GetDeviceBuffer(),
                                                          p_reduces,
                                                          M,
                                                          N,
                                                          K,
                                                          StrideA,
                                                          StrideB,
                                                          StrideC,
                                                          {},
                                                          gemm_element_ops,
                                                          {},
                                                          reduce_in_element_ops,
                                                          reduce_out_element_ops,
                                                          BatchCount);
282
283
284
285
286

        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
JD's avatar
JD committed
287
            // init DO, D1 to 0
288
289
            reduce0_device_buf.SetZero();
            reduce1_device_buf.SetZero();
290

JD's avatar
JD committed
291
292
            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318

            std::string gemm_name = gemm_ptr->GetTypeString();

            std::size_t flop      = std::size_t(2) * BatchCount * M * N * K;
            std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
                                    sizeof(BDataType) * BatchCount * K * N +
                                    sizeof(CDataType) * BatchCount * M * N;

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
                      << " GB/s, " << gemm_name << std::endl;

            if(tflops > best_tflops)
            {
                best_gemm_name  = gemm_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }

            if(do_verification)
            {
                c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
319
320
                reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
                reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
321

322
                bool c_error =
323
                    ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
324
                bool d0_error =
325
                    ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData);
326
                bool d1_error =
327
                    ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData);
328
329
330
331

                pass = pass && (c_error == true);
                pass = pass && (d0_error == true);
                pass = pass && (d1_error == true);
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "c_device: ", c_g_m_n_device_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "d0_host: ", d0_g_m_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "d0_device: ", d0_g_m_device_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "d1_host: ", d1_g_m_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "d1_device: ", d1_g_m_device_result.mData, ",")
                        << std::endl;
                }
            }
        }
        else
        {
            std::cout << "does not support this GEMM problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;

    return pass;
}

} // namespace profiler
} // namespace ck