// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>
#include <iostream>
#include <typeinfo>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp"

namespace ck {
namespace profiler {
// Debug helper: dump a 4-d host tensor to stdout as nested bracketed lists.
// Dimensions are walked in the order dim0, dim2, dim3 with dim1 innermost
// (i.e. the dim-1 values of one (dim0, dim2, dim3) position are printed
// together). Values are converted to float for printing.
template <typename DataType>
void show_data_nhwc_layout(Tensor<DataType>& nhwc)
{
    const auto& lens = nhwc.mDesc.GetLengths();

    const int len0 = ck::type_convert<int>(lens[0]);
    const int len1 = ck::type_convert<int>(lens[1]);
    const int len2 = ck::type_convert<int>(lens[2]);
    const int len3 = ck::type_convert<int>(lens[3]);

    std::cout << "[";
    for(int i0 = 0; i0 < len0; ++i0)
    {
        std::cout << "[";
        for(int i2 = 0; i2 < len2; ++i2)
        {
            std::cout << "[";
            for(int i3 = 0; i3 < len3; ++i3)
            {
                std::cout << "[";
                for(int i1 = 0; i1 < len1; ++i1)
                {
                    std::cout << static_cast<float>(nhwc(i0, i1, i2, i3)) << "  ";
                }
                std::cout << "]";
            }
            std::cout << "]";
        }
        std::cout << "]";
    }
    std::cout << "]";
}

template <ck::index_t NDimSpatial,
55
56
          typename InLayout,
          typename WeiLayout,
57
58
59
60
          typename OutLayout,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType>
61
62
63
bool profile_conv_bwd_weight_impl(int do_verification,
                                  int init_method,
                                  bool do_log,
JD's avatar
JD committed
64
                                  bool time_kernel,
65
                                  const ck::tensor_operation::device::ConvParams& params,
66
67
                                  ck::index_t split_k)
{
68
69
70
71
72
73
74
75
76
77
    // make host tensor descritpor
    auto f_nhwc_host_tensor_descriptor =
        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
                                                  static_cast<std::size_t>(c)};
            nhwc_lengths.insert(
                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());

            return HostTensorDescriptor(nhwc_lengths);
        };
78

79
80
81
82
83
    auto f_nchw_host_tensor_descriptor =
        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
            std::vector<std::size_t> nchw_lengths{static_cast<std::size_t>(n),
                                                  static_cast<std::size_t>(c)};
            nchw_lengths.insert(nchw_lengths.end(), spatial_lengths.begin(), spatial_lengths.end());
84

85
            return HostTensorDescriptor(nchw_lengths);
86
87
        };

88
    HostTensorDescriptor in_desc, wei_desc, out_desc;
89

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
    // FIXME: properly implement "make host descriptor" for different layout
    if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC> ||
                 is_same_v<InLayout, ck::tensor_layout::convolution::NHWC> ||
                 is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
    {
        in_desc =
            f_nhwc_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
    }
    else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NCW> ||
                      is_same_v<InLayout, ck::tensor_layout::convolution::NCHW> ||
                      is_same_v<InLayout, ck::tensor_layout::convolution::NCDHW>)
    {
        in_desc =
            f_nchw_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_);
    }

    // FIXME: properly implement "make host descriptor" for different layout
    if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC> ||
                 is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC> ||
                 is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
    {
        wei_desc =
            f_nhwc_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
    }
    else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KCX> ||
                      is_same_v<WeiLayout, ck::tensor_layout::convolution::KCYX> ||
                      is_same_v<WeiLayout, ck::tensor_layout::convolution::KCZYX>)
    {
        wei_desc =
            f_nchw_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_);
    }

    // FIXME: properly implement "make host descriptor" for different layout
    if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK> ||
                 is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK> ||
                 is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
    {
        out_desc =
            f_nhwc_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
    }
    else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NKW> ||
                      is_same_v<OutLayout, ck::tensor_layout::convolution::NKHW> ||
                      is_same_v<OutLayout, ck::tensor_layout::convolution::NKDHW>)
    {
        out_desc =
            f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths());
    }

    Tensor<InDataType> input(in_desc);
    Tensor<WeiDataType> weight_host_result(wei_desc);
    Tensor<WeiDataType> weight_device_result(wei_desc);
    Tensor<OutDataType> output(out_desc);

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weight: " << weight_host_result.mDesc << std::endl;
    std::cout << "output: " << output.mDesc << std::endl;
146
147
148
149
150

    switch(init_method)
    {
    case 0: break;
    case 1:
151
152
        input.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
        output.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
153
154
        break;
    default:
155
156
        input.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
        output.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
157
158
159
160
161
162
163
164
165
166
    }

    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
    using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

167
168
169
170
171
172
173
    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_device_result.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    out_device_buf.ToDevice(output.mData.data());

174
175
    if(do_verification)
    {
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                           InLayout,
                                                                           WeiLayout,
                                                                           OutLayout,
                                                                           InDataType,
                                                                           WeiDataType,
                                                                           OutDataType,
                                                                           InElementOp,
                                                                           WeiElementOp,
                                                                           OutElementOp>{};

        auto ref_invoker = ref_conv.MakeInvoker();

        auto ref_argument = ref_conv.MakeArgument(input,
                                                  weight_host_result,
                                                  output,
                                                  params.conv_filter_strides_,
                                                  params.conv_filter_dilations_,
                                                  params.input_left_pads_,
                                                  params.input_right_pads_,
196
197
198
199
200
201
202
                                                  in_element_op,
                                                  wei_element_op,
                                                  out_element_op);

        ref_invoker.Run(ref_argument);
    }

Chao Liu's avatar
Chao Liu committed
203
204
205
206
207
208
209
210
211
212
    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
                                                                       InLayout,
                                                                       WeiLayout,
                                                                       OutLayout,
                                                                       InDataType,
                                                                       WeiDataType,
                                                                       OutDataType,
                                                                       InElementOp,
                                                                       WeiElementOp,
                                                                       OutElementOp>;
213

Chao Liu's avatar
Chao Liu committed
214
215
216
217
218
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
219

220
221
    std::string best_op_name;
    float best_avg_time   = 0;
222
223
224
225
226
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device Conv instances
    bool pass = true;
JD's avatar
JD committed
227

228
    for(auto& op_ptr : op_ptrs)
229
    {
230
        // using atomic, so need to reset input, setzero is done in invoker
Chao Liu's avatar
Chao Liu committed
231
        wei_device_buf.SetZero();
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

        auto argument_ptr =
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                        params.N_,
                                        params.K_,
                                        params.C_,
                                        params.input_spatial_lengths_,
                                        params.filter_spatial_lengths_,
                                        params.output_spatial_lengths_,
                                        params.conv_filter_strides_,
                                        params.conv_filter_dilations_,
                                        params.input_left_pads_,
                                        params.input_right_pads_,
                                        in_element_op,
                                        wei_element_op,
                                        out_element_op,
                                        split_k);

Chao Liu's avatar
Chao Liu committed
252
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
253
        {
Chao Liu's avatar
Chao Liu committed
254
            std::string op_name = op_ptr->GetTypeString();
JD's avatar
JD committed
255

Chao Liu's avatar
Chao Liu committed
256
            auto invoker_ptr = op_ptr->MakeInvokerPointer();
257

Chao Liu's avatar
Chao Liu committed
258
259
            float avg_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
260

Chao Liu's avatar
Chao Liu committed
261
262
            std::size_t flop      = params.GetFlops();
            std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();
263

Chao Liu's avatar
Chao Liu committed
264
265
            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_btype / 1.E6 / avg_time;
266

Chao Liu's avatar
Chao Liu committed
267
268
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;
269

Chao Liu's avatar
Chao Liu committed
270
271
272
273
274
275
276
            if(tflops > best_tflops)
            {
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
            }
277

Chao Liu's avatar
Chao Liu committed
278
279
280
            if(do_verification)
            {
                wei_device_buf.FromDevice(weight_device_result.mData.data());
281

Chao Liu's avatar
Chao Liu committed
282
283
                pass = pass &
                       ck::utils::check_err(weight_device_result.mData, weight_host_result.mData);
284

Chao Liu's avatar
Chao Liu committed
285
286
287
288
289
                if(do_log)
                {
                    std::cout << "in : ";
                    show_data_nhwc_layout(output);
                    std::cout << std::endl;
290

Chao Liu's avatar
Chao Liu committed
291
292
293
                    std::cout << "wei: ";
                    show_data_nhwc_layout(weight_host_result);
                    std::cout << std::endl;
JD's avatar
JD committed
294

Chao Liu's avatar
Chao Liu committed
295
296
297
                    std::cout << "out  : ";
                    show_data_nhwc_layout(input);
                    std::cout << std::endl;
298

Chao Liu's avatar
Chao Liu committed
299
300
301
302
                    std::cout << "wei_device: ";
                    show_data_nhwc_layout(weight_device_result);
                    std::cout << std::endl;
                }
303
304
            }
        }
Chao Liu's avatar
Chao Liu committed
305
306
307
308
        else
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
        }
309
310
    }

Chao Liu's avatar
Chao Liu committed
311
312
313
    std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
314
315
316
317
318
319

    return pass;
}

} // namespace profiler
} // namespace ck