pool2d_fwd_common.hpp 8.96 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
// SPDX-License-Identifier: MIT
Illia Silin's avatar
Illia Silin committed
2
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
Chao Liu's avatar
Chao Liu committed
3

Qianfeng's avatar
Qianfeng committed
4
5
#pragma once

6
#include <iostream>
7

Chao Liu's avatar
Chao Liu committed
8
9
10
11
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
12
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_impl.hpp"
Chao Liu's avatar
Chao Liu committed
13
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
14

Chao Liu's avatar
Chao Liu committed
15
#include "ck/library/utility/check_err.hpp"
16
17
18
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
19
#include "ck/library/utility/literals.hpp"
rocking's avatar
rocking committed
20
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
21
22
23

template <typename InDataType,
          typename OutDataType,
rocking's avatar
rocking committed
24
          typename ComputeDataType,
Qianfeng's avatar
Qianfeng committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
          typename IndexDataType,
          typename InLayout,
          typename OutLayout,
          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool OutputIndex>
bool pool_test(bool do_verification,
               int init_method,
               bool time_kernel,
               ck::index_t N,
               ck::index_t C,
               ck::index_t Y,
               ck::index_t X,
               ck::index_t Hi,
               ck::index_t Wi,
               ck::index_t window_stride_h,
               ck::index_t window_stride_w,
rocking's avatar
rocking committed
42
43
               ck::index_t window_dilation_h,
               ck::index_t window_dilation_w,
Qianfeng's avatar
Qianfeng committed
44
45
46
47
               ck::index_t in_left_pad_h,
               ck::index_t in_left_pad_w,
               ck::index_t in_right_pad_h,
               ck::index_t in_right_pad_w)
48
{
Qianfeng's avatar
Qianfeng committed
49
    using DevicePoolFwdInstance =
50
51
52
53
54
55
56
57
58
59
60
61
62
        ck::tensor_operation::device::DevicePool2dFwdImpl<InDataType,      // InDataType
                                                          OutDataType,     // OutDataType
                                                          IndexDataType,   // IndexDataType
                                                          ComputeDataType, // ComputeDataType
                                                          ReduceOpId,
                                                          OutputIndex,
                                                          64,     // BlockSize
                                                          64,     // ReduceMThreadClusterSize
                                                          1,      // ReduceKThreadClusterSize
                                                          4,      // ReduceMThreadSliceSize
                                                          1,      // ReduceKThreadSliceSize
                                                          1,      // InSrcOutDstVectorSize
                                                          false>; // IsFastestDimReduced
63

rocking's avatar
rocking committed
64
65
66
67
    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
68

rocking's avatar
rocking committed
69
70
    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
rocking's avatar
rocking committed
71
    const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
rocking's avatar
rocking committed
72
73
    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
74
75
76
77

    // tensor layout
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
78
79
            using namespace ck::literals;

80
81
            if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
            {
82
                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
83
84
85
86
            }
            else if constexpr(ck::is_same<decltype(layout),
                                          ck::tensor_layout::convolution::NHWC>::value)
            {
87
                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
88
89
90
91
92
            }
        };

    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
93
94
    Tensor<IndexDataType> out_indices_n_c_ho_wo_host(
        f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
95
    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
96
97
    Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
        f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
98
99
100
101
102
103
104

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
105
106
107
    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
    case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
108
109
    }

110
111
112
    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) *
                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
113
    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
114
                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
115
116
117

    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());

118
119
120
121
122
123
    auto pool         = DevicePoolFwdInstance{};
    auto invoker_ptr  = pool.MakeInvokerPointer();
    auto argument_ptr = pool.MakeArgumentPointer(
        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
rocking's avatar
rocking committed
124
125
126
127
128
129
        {N, C, Hi, Wi},
        {Y, X},
        {N, C, Ho, Wo},
        {C * Hi * Wi, 1, Wi * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
130
        window_strides,
rocking's avatar
rocking committed
131
        window_dilations,
132
        input_left_pads,
rocking's avatar
rocking committed
133
134
        input_right_pads,
        {2, 3});
135
136
137
138
139
140
141

    if(!pool.IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
                                 "not support this problem");
    }

JD's avatar
JD committed
142
    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
143
144
145
146
147
148
149
150
151
152
153
154
155

    std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;

    std::size_t num_btype =
        sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

Anthony Chang's avatar
Anthony Chang committed
156
    bool pass = true;
157

158
159
    if(do_verification)
    {
rocking's avatar
rocking committed
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
        using ReferencePoolingFwdInstance =
            ck::tensor_operation::host::ReferencePoolingFwd<4,
                                                            2,
                                                            InDataType,
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ReduceOpId,
                                                            PropagateNan,
                                                            OutputIndex>;

        auto ref_pooling          = ReferencePoolingFwdInstance{};
        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_hi_wi,
                                                             out_n_c_ho_wo_host,
                                                             out_indices_n_c_ho_wo_host,
                                                             window_spatial_lengths,
                                                             window_strides,
rocking's avatar
rocking committed
178
                                                             window_dilations,
rocking's avatar
rocking committed
179
180
181
182
                                                             input_left_pads,
                                                             input_right_pads);

        ref_pooling_invoker.Run(ref_pooling_argument);
183
184
185

        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

186
        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
187

188
        if constexpr(OutputIndex)
189
190
191
        {
            out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());

192
193
            pass = pass &&
                   ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
194
195
        };
    }
196

Qianfeng's avatar
Qianfeng committed
197
198
    return (pass);
};