pool2d_fwd_common.hpp 8 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

Qianfeng's avatar
Qianfeng committed
4
5
#pragma once

6
#include <iostream>
7

Chao Liu's avatar
Chao Liu committed
8
9
10
11
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
12
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
Chao Liu's avatar
Chao Liu committed
13
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
14

Chao Liu's avatar
Chao Liu committed
15
#include "ck/library/utility/check_err.hpp"
16
17
18
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
19
#include "ck/library/utility/literals.hpp"
rocking's avatar
rocking committed
20
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
21
22
23

template <typename InDataType,
          typename OutDataType,
rocking's avatar
rocking committed
24
          typename ComputeDataType,
Qianfeng's avatar
Qianfeng committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
          typename IndexDataType,
          typename InLayout,
          typename OutLayout,
          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool OutputIndex>
bool pool_test(bool do_verification,
               int init_method,
               bool time_kernel,
               ck::index_t N,
               ck::index_t C,
               ck::index_t Y,
               ck::index_t X,
               ck::index_t Hi,
               ck::index_t Wi,
               ck::index_t window_stride_h,
               ck::index_t window_stride_w,
               ck::index_t in_left_pad_h,
               ck::index_t in_left_pad_w,
               ck::index_t in_right_pad_h,
               ck::index_t in_right_pad_w)
46
{
Qianfeng's avatar
Qianfeng committed
47
48
    using DevicePoolFwdInstance =
        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
rocking's avatar
rocking committed
49
50
51
52
            InDataType,      // InDataType
            OutDataType,     // OutDataType
            IndexDataType,   // IndexDataType
            ComputeDataType, // ComputeDataType
Qianfeng's avatar
Qianfeng committed
53
54
55
56
57
58
59
60
            ReduceOpId,
            OutputIndex,
            64, // BlockSize
            64, // ReduceMThreadClusterSize
            1,  // ReduceKThreadClusterSize
            4,  // ReduceMThreadSliceSize
            1,  // ReduceKThreadSliceSize
            4>; // InSrcOutDstVectorSize
61
62
63
64

    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;

rocking's avatar
rocking committed
65
66
67
68
    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
69
70
71
72

    // tensor layout
    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
73
74
            using namespace ck::literals;

75
76
            if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
            {
77
                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
78
79
80
81
            }
            else if constexpr(ck::is_same<decltype(layout),
                                          ck::tensor_layout::convolution::NHWC>::value)
            {
82
                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
83
84
85
86
87
            }
        };

    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
88
89
    Tensor<IndexDataType> out_indices_n_c_ho_wo_host(
        f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
90
    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
91
92
    Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
        f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
93
94
95
96
97
98
99

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
100
101
102
    case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
    case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
    default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
103
104
    }

105
106
107
    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) *
                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
108
    DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
109
                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
110
111
112

    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());

113
114
115
116
117
118
    auto pool         = DevicePoolFwdInstance{};
    auto invoker_ptr  = pool.MakeInvokerPointer();
    auto argument_ptr = pool.MakeArgumentPointer(
        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
rocking's avatar
rocking committed
119
120
121
122
123
124
        {N, C, Hi, Wi},
        {Y, X},
        {N, C, Ho, Wo},
        {C * Hi * Wi, 1, Wi * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
125
126
        window_strides,
        input_left_pads,
rocking's avatar
rocking committed
127
128
        input_right_pads,
        {2, 3});
129
130
131
132
133
134
135

    if(!pool.IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error("wrong! device_op with the specified compilation parameters does "
                                 "not support this problem");
    }

JD's avatar
JD committed
136
    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
137
138
139
140
141
142
143
144
145
146
147
148
149

    std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;

    std::size_t num_btype =
        sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo);

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

Anthony Chang's avatar
Anthony Chang committed
150
    bool pass = true;
151

152
153
    if(do_verification)
    {
rocking's avatar
rocking committed
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
        using ReferencePoolingFwdInstance =
            ck::tensor_operation::host::ReferencePoolingFwd<4,
                                                            2,
                                                            InDataType,
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ReduceOpId,
                                                            PropagateNan,
                                                            OutputIndex>;

        auto ref_pooling          = ReferencePoolingFwdInstance{};
        auto ref_pooling_invoker  = ref_pooling.MakeInvoker();
        auto ref_pooling_argument = ref_pooling.MakeArgument(in_n_c_hi_wi,
                                                             out_n_c_ho_wo_host,
                                                             out_indices_n_c_ho_wo_host,
                                                             window_spatial_lengths,
                                                             window_strides,
                                                             input_left_pads,
                                                             input_right_pads);

        ref_pooling_invoker.Run(ref_pooling_argument);
176
177
178

        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

179
        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
180

181
        if constexpr(OutputIndex)
182
183
184
        {
            out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());

185
186
            pass = pass &&
                   ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
187
188
        };
    }
189

Qianfeng's avatar
Qianfeng committed
190
191
    return (pass);
};