conv.cu 18.7 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#include <iostream>
Chao Liu's avatar
Chao Liu committed
2
3
#include <numeric>
#include <initializer_list>
Chao Liu's avatar
Chao Liu committed
4
#include <cstdlib>
Chao Liu's avatar
Chao Liu committed
5
6
#include "nvToolsExt.h"
#include "tensor.hpp"
Chao Liu's avatar
Chao Liu committed
7
#include "ConstantTensorDescriptor.cuh"
Chao Liu's avatar
Chao Liu committed
8
#include "conv_common.cuh"
Chao Liu's avatar
rename  
Chao Liu committed
9
10
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
Chao Liu's avatar
Chao Liu committed
11
12
#include "device_implicit_gemm_convolution_nchw_kcsr.cuh"
#include "device_implicit_gemm_convolution_nchw_srck.cuh"
Chao Liu's avatar
Chao Liu committed
13
//#include "device_winograd_convolution.cuh"
Chao Liu's avatar
Chao Liu committed
14

Chao Liu's avatar
Chao Liu committed
15
// Tensor value generator that yields the constant 1 for every element.
struct GeneratorTensor_1
{
    // Accepts any number of index arguments and ignores all of them.
    // Returns 1 (as double) regardless of the indices.
    template <class... Is>
    double operator()(Is...)
    {
        return 1;
    }
};

Chao Liu's avatar
Chao Liu committed
24
25
26
27
28
29
30
31
32
33
34
35
// Tensor value generator that yields pseudo-random integers (returned as
// double) drawn with std::rand() from the half-open interval
// [min_value, max_value).
struct GeneratorTensor_2
{
    int min_value = 0; // inclusive lower bound
    int max_value = 1; // exclusive upper bound

    // Accepts any number of index arguments and ignores all of them.
    // If the interval is empty (max_value <= min_value) min_value is returned;
    // this avoids the modulo-by-zero that `rand() % 0` would otherwise trigger.
    template <class... Is>
    double operator()(Is...)
    {
        const int range = max_value - min_value;
        if(range <= 0)
        {
            return min_value;
        }
        return (std::rand() % range) + min_value;
    }
};

Chao Liu's avatar
Chao Liu committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
// Tensor value generator whose output depends on the element's indices.
struct GeneratorTensor_3
{
    // Takes one or more indices (each converted to std::size_t) and returns a
    // value derived from them.
    //
    // Active variant: the indices are treated as digits of a base-100 number,
    // most significant first, and 1 is added, e.g. (1, 2, 3, 4) -> 1020304 + 1.
    // Distinct index tuples therefore map to distinct values as long as every
    // index is < 100.
    // Disabled variant (#if 0): plain sum of the indices.
    template <class... Is>
    double operator()(Is... is)
    {
        static_assert(sizeof...(Is) > 0, "GeneratorTensor_3 needs at least one index");

        std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
#if 0
        return std::accumulate(ids.begin(), ids.end(), std::size_t(0));
#else
        // Horner evaluation == inner product of ids with strides {..., 100^2, 100, 1}.
        // (The strides must be a running *product* of the lengths; a running sum
        // would make different index tuples collide.)
        std::size_t value = 0;
        for(std::size_t id : ids)
        {
            value = value * 100 + id;
        }
        return value + 1;
#endif
    }
};

Chao Liu's avatar
Chao Liu committed
55
56
57
58
59
60
// Writes the lengths and strides of a compile-time tensor descriptor to `os`
// (std::cout by default) on a single line terminated by std::endl.
// this is ugly, only for 4d: the descriptor type must have nDim == 4.
// The descriptor argument is used only for its type; a fresh instance is
// default-constructed inside.
template <class TConstTensorDesc>
void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
{
    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    // compile-time dimension selectors
    constexpr auto I0   = Number<0>{};
    constexpr auto I1   = Number<1>{};
    constexpr auto I2   = Number<2>{};
    constexpr auto I3   = Number<3>{};
    constexpr auto desc = TConstTensorDesc{};

    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
}

// Builds a run-time TensorDescriptor carrying the same four lengths and four
// strides as the given compile-time descriptor type.
// this is ugly, only for 4d: the descriptor type must have nDim == 4.
// The descriptor argument is used only for its type; a fresh instance is
// default-constructed inside.
template <class TConstTensorDesc>
auto make_TensorDescriptor(TConstTensorDesc)
{
    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    // compile-time dimension selectors
    constexpr auto I0   = Number<0>{};
    constexpr auto I1   = Number<1>{};
    constexpr auto I2   = Number<2>{};
    constexpr auto I3   = Number<3>{};
    constexpr auto desc = TConstTensorDesc{};

    std::initializer_list<unsigned> lengths = {
        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
    std::initializer_list<unsigned> strides = {
        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};

    return TensorDescriptor(lengths, strides);
}

template <class T>
Chao Liu's avatar
Chao Liu committed
94
void host_direct_convolution(const Tensor<T>& in_nchw, const Tensor<T>& wei_kcsr, Tensor<T>& out)
Chao Liu's avatar
Chao Liu committed
95
96
97
{
    auto f = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
Chao Liu's avatar
Chao Liu committed
98
        for(int c = 0; c < wei_kcsr.mDesc.GetLengths()[1]; ++c)
Chao Liu's avatar
Chao Liu committed
99
        {
Chao Liu's avatar
Chao Liu committed
100
            for(int y = 0; y < wei_kcsr.mDesc.GetLengths()[2]; ++y)
Chao Liu's avatar
Chao Liu committed
101
102
            {
                int hi = ho + y;
Chao Liu's avatar
Chao Liu committed
103
                for(int x = 0; x < wei_kcsr.mDesc.GetLengths()[3]; ++x)
Chao Liu's avatar
Chao Liu committed
104
105
                {
                    int wi = wo + x;
Chao Liu's avatar
Chao Liu committed
106
                    v += in_nchw(n, c, hi, wi) * wei_kcsr(k, c, y, x);
Chao Liu's avatar
Chao Liu committed
107
108
109
110
111
112
113
114
115
116
117
118
                }
            }
        }
        out(n, k, ho, wo) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            out.mDesc.GetLengths()[0],
                                            out.mDesc.GetLengths()[1],
                                            out.mDesc.GetLengths()[2],
                                            out.mDesc.GetLengths()[3]);

Chao Liu's avatar
Chao Liu committed
119
    f_par(std::thread::hardware_concurrency());
Chao Liu's avatar
Chao Liu committed
120
121
}

Chao Liu's avatar
Chao Liu committed
122
template <class T>
Chao Liu's avatar
Chao Liu committed
123
124
125
void host_winograd_3x3_convolution(const Tensor<T>& in_nchw,
                                   const Tensor<T>& wei_kcsr,
                                   Tensor<T>& out)
Chao Liu's avatar
Chao Liu committed
126
{
Chao Liu's avatar
Chao Liu committed
127
128
129
    constexpr std::size_t OutTileSizeH = 2;
    constexpr std::size_t OutTileSizeW = 2;

Chao Liu's avatar
Chao Liu committed
130
131
132
133
    std::size_t N  = in_nchw.mDesc.GetLengths()[0];
    std::size_t C  = in_nchw.mDesc.GetLengths()[1];
    std::size_t HI = in_nchw.mDesc.GetLengths()[2];
    std::size_t WI = in_nchw.mDesc.GetLengths()[3];
Chao Liu's avatar
Chao Liu committed
134

Chao Liu's avatar
Chao Liu committed
135
136
137
    std::size_t K = wei_kcsr.mDesc.GetLengths()[0];
    std::size_t S = wei_kcsr.mDesc.GetLengths()[2];
    std::size_t R = wei_kcsr.mDesc.GetLengths()[3];
Chao Liu's avatar
Chao Liu committed
138
139
140
141
142
143
144
145
146

    std::size_t HO = out.mDesc.GetLengths()[2];
    std::size_t WO = out.mDesc.GetLengths()[3];

    std::size_t InTileSizeH = OutTileSizeH + S - 1;
    std::size_t InTileSizeW = OutTileSizeW + R - 1;

    std::size_t Y = (HO + OutTileSizeH - 1) / OutTileSizeH;
    std::size_t X = (WO + OutTileSizeW - 1) / OutTileSizeW;
Chao Liu's avatar
Chao Liu committed
147

Chao Liu's avatar
Chao Liu committed
148
149
150
151
152
153
154
155
156
157
158
159
160
    Tensor<T> in_hold({N, C, Y, X, InTileSizeH, InTileSizeW});
    Tensor<T> in_transform({N, C, Y, X, InTileSizeH, InTileSizeW});
    Tensor<T> wei_transform({K, C, InTileSizeH, InTileSizeW});
    Tensor<T> out_transform({N, K, Y, X, InTileSizeH, InTileSizeH});
    Tensor<T> out_hold({N, K, Y, X, OutTileSizeH, OutTileSizeW});

    auto f_in_hold = [&](auto n, auto c, auto y, auto x) {
        for(int j = 0; j < InTileSizeH; ++j)
        {
            std::size_t hi = OutTileSizeH * y + j;
            for(int i = 0; i < InTileSizeW; ++i)
            {
                std::size_t wi            = OutTileSizeW * x + i;
Chao Liu's avatar
Chao Liu committed
161
                in_hold(n, c, y, x, j, i) = in_nchw(n, c, hi, wi);
Chao Liu's avatar
Chao Liu committed
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
            }
        }
    };

    auto f_in_transform = [&](auto n, auto c, auto y, auto x) {
        in_transform(n, c, y, x, 0, 0) = in_hold(n, c, y, x, 0, 0) - in_hold(n, c, y, x, 0, 2) -
                                         in_hold(n, c, y, x, 2, 0) + in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 0, 1) = in_hold(n, c, y, x, 0, 1) + in_hold(n, c, y, x, 0, 2) -
                                         in_hold(n, c, y, x, 2, 1) - in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 0, 2) = -in_hold(n, c, y, x, 0, 1) + in_hold(n, c, y, x, 0, 2) +
                                         in_hold(n, c, y, x, 2, 1) - in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 0, 3) = in_hold(n, c, y, x, 0, 1) - in_hold(n, c, y, x, 0, 3) -
                                         in_hold(n, c, y, x, 2, 1) + in_hold(n, c, y, x, 2, 3);

        in_transform(n, c, y, x, 1, 0) = in_hold(n, c, y, x, 1, 0) - in_hold(n, c, y, x, 1, 2) +
                                         in_hold(n, c, y, x, 2, 0) - in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 1, 1) = in_hold(n, c, y, x, 1, 1) + in_hold(n, c, y, x, 1, 2) +
                                         in_hold(n, c, y, x, 2, 1) + in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 1, 2) = -in_hold(n, c, y, x, 1, 1) + in_hold(n, c, y, x, 1, 2) -
                                         in_hold(n, c, y, x, 2, 1) + in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 1, 3) = in_hold(n, c, y, x, 1, 1) - in_hold(n, c, y, x, 1, 3) +
                                         in_hold(n, c, y, x, 2, 1) - in_hold(n, c, y, x, 2, 3);

        in_transform(n, c, y, x, 2, 0) = -in_hold(n, c, y, x, 1, 0) + in_hold(n, c, y, x, 1, 2) +
                                         in_hold(n, c, y, x, 2, 0) - in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 2, 1) = -in_hold(n, c, y, x, 1, 1) - in_hold(n, c, y, x, 1, 2) +
                                         in_hold(n, c, y, x, 2, 1) + in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 2, 2) = in_hold(n, c, y, x, 1, 1) - in_hold(n, c, y, x, 1, 2) -
                                         in_hold(n, c, y, x, 2, 1) + in_hold(n, c, y, x, 2, 2);
        in_transform(n, c, y, x, 2, 3) = -in_hold(n, c, y, x, 1, 1) + in_hold(n, c, y, x, 1, 3) +
                                         in_hold(n, c, y, x, 2, 1) - in_hold(n, c, y, x, 2, 3);

        in_transform(n, c, y, x, 3, 0) = in_hold(n, c, y, x, 1, 0) - in_hold(n, c, y, x, 1, 2) -
                                         in_hold(n, c, y, x, 3, 0) + in_hold(n, c, y, x, 3, 2);
        in_transform(n, c, y, x, 3, 1) = in_hold(n, c, y, x, 1, 1) + in_hold(n, c, y, x, 1, 2) -
                                         in_hold(n, c, y, x, 3, 1) - in_hold(n, c, y, x, 3, 2);
        in_transform(n, c, y, x, 3, 2) = -in_hold(n, c, y, x, 1, 1) + in_hold(n, c, y, x, 1, 2) +
                                         in_hold(n, c, y, x, 3, 1) - in_hold(n, c, y, x, 3, 2);
        in_transform(n, c, y, x, 3, 3) = in_hold(n, c, y, x, 1, 1) - in_hold(n, c, y, x, 1, 3) -
                                         in_hold(n, c, y, x, 3, 1) + in_hold(n, c, y, x, 3, 3);
    };

    auto f_wei_transform = [&](auto k, auto c) {
Chao Liu's avatar
Chao Liu committed
205
        wei_transform(k, c, 0, 0) = wei_kcsr(k, c, 0, 0);
Chao Liu's avatar
Chao Liu committed
206
        wei_transform(k, c, 0, 1) =
Chao Liu's avatar
Chao Liu committed
207
            0.5 * wei_kcsr(k, c, 0, 0) + 0.5 * wei_kcsr(k, c, 0, 1) + 0.5 * wei_kcsr(k, c, 0, 2);
Chao Liu's avatar
Chao Liu committed
208
        wei_transform(k, c, 0, 2) =
Chao Liu's avatar
Chao Liu committed
209
210
            0.5 * wei_kcsr(k, c, 0, 0) - 0.5 * wei_kcsr(k, c, 0, 1) + 0.5 * wei_kcsr(k, c, 0, 2);
        wei_transform(k, c, 0, 3) = wei_kcsr(k, c, 0, 2);
Chao Liu's avatar
Chao Liu committed
211
212

        wei_transform(k, c, 1, 0) =
Chao Liu's avatar
Chao Liu committed
213
214
215
216
217
218
219
220
221
222
223
            0.5 * wei_kcsr(k, c, 0, 0) + 0.5 * wei_kcsr(k, c, 1, 0) + 0.5 * wei_kcsr(k, c, 2, 0);
        wei_transform(k, c, 1, 1) = 0.25 * wei_kcsr(k, c, 0, 0) + 0.25 * wei_kcsr(k, c, 0, 1) +
                                    0.25 * wei_kcsr(k, c, 0, 2) + 0.25 * wei_kcsr(k, c, 1, 0) +
                                    0.25 * wei_kcsr(k, c, 1, 1) + 0.25 * wei_kcsr(k, c, 1, 2) +
                                    0.25 * wei_kcsr(k, c, 2, 0) + 0.25 * wei_kcsr(k, c, 2, 1) +
                                    0.25 * wei_kcsr(k, c, 2, 2);
        wei_transform(k, c, 1, 2) = 0.25 * wei_kcsr(k, c, 0, 0) - 0.25 * wei_kcsr(k, c, 0, 1) +
                                    0.25 * wei_kcsr(k, c, 0, 2) + 0.25 * wei_kcsr(k, c, 1, 0) -
                                    0.25 * wei_kcsr(k, c, 1, 1) + 0.25 * wei_kcsr(k, c, 1, 2) +
                                    0.25 * wei_kcsr(k, c, 2, 0) - 0.25 * wei_kcsr(k, c, 2, 1) +
                                    0.25 * wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
224
        wei_transform(k, c, 1, 3) =
Chao Liu's avatar
Chao Liu committed
225
            0.5 * wei_kcsr(k, c, 0, 2) + 0.5 * wei_kcsr(k, c, 1, 2) + 0.5 * wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
226
227

        wei_transform(k, c, 2, 0) =
Chao Liu's avatar
Chao Liu committed
228
229
230
231
232
233
234
235
236
237
238
            0.5 * wei_kcsr(k, c, 0, 0) - 0.5 * wei_kcsr(k, c, 1, 0) + 0.5 * wei_kcsr(k, c, 2, 0);
        wei_transform(k, c, 2, 1) = 0.25 * wei_kcsr(k, c, 0, 0) + 0.25 * wei_kcsr(k, c, 0, 1) +
                                    0.25 * wei_kcsr(k, c, 0, 2) - 0.25 * wei_kcsr(k, c, 1, 0) -
                                    0.25 * wei_kcsr(k, c, 1, 1) - 0.25 * wei_kcsr(k, c, 1, 2) +
                                    0.25 * wei_kcsr(k, c, 2, 0) + 0.25 * wei_kcsr(k, c, 2, 1) +
                                    0.25 * wei_kcsr(k, c, 2, 2);
        wei_transform(k, c, 2, 2) = 0.25 * wei_kcsr(k, c, 0, 0) - 0.25 * wei_kcsr(k, c, 0, 1) +
                                    0.25 * wei_kcsr(k, c, 0, 2) - 0.25 * wei_kcsr(k, c, 1, 0) +
                                    0.25 * wei_kcsr(k, c, 1, 1) - 0.25 * wei_kcsr(k, c, 1, 2) +
                                    0.25 * wei_kcsr(k, c, 2, 0) - 0.25 * wei_kcsr(k, c, 2, 1) +
                                    0.25 * wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
239
        wei_transform(k, c, 2, 3) =
Chao Liu's avatar
Chao Liu committed
240
            0.5 * wei_kcsr(k, c, 0, 2) - 0.5 * wei_kcsr(k, c, 1, 2) + 0.5 * wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
241

Chao Liu's avatar
Chao Liu committed
242
        wei_transform(k, c, 3, 0) = wei_kcsr(k, c, 2, 0);
Chao Liu's avatar
Chao Liu committed
243
        wei_transform(k, c, 3, 1) =
Chao Liu's avatar
Chao Liu committed
244
            0.5 * wei_kcsr(k, c, 2, 0) + 0.5 * wei_kcsr(k, c, 2, 1) + 0.5 * wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
245
        wei_transform(k, c, 3, 2) =
Chao Liu's avatar
Chao Liu committed
246
247
            0.5 * wei_kcsr(k, c, 2, 0) - 0.5 * wei_kcsr(k, c, 2, 1) + 0.5 * wei_kcsr(k, c, 2, 2);
        wei_transform(k, c, 3, 3) = wei_kcsr(k, c, 2, 2);
Chao Liu's avatar
Chao Liu committed
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
    };

    auto f_out_transform = [&](auto n, auto k, auto y, auto x) {
        for(int j = 0; j < InTileSizeH; ++j)
        {
            for(int i = 0; i < InTileSizeW; ++i)
            {
                double v = 0;
                for(int c = 0; c < C; ++c)
                {
                    v += in_transform(n, c, y, x, j, i) * wei_transform(k, c, j, i);
                }

                out_transform(n, k, y, x, j, i) = v;
            }
        }
    };

    auto f_out_hold = [&](auto n, auto k, auto y, auto x) {
        out_hold(n, k, y, x, 0, 0) =
            out_transform(n, k, y, x, 0, 0) + out_transform(n, k, y, x, 0, 1) +
            out_transform(n, k, y, x, 0, 2) + out_transform(n, k, y, x, 1, 0) +
            out_transform(n, k, y, x, 1, 1) + out_transform(n, k, y, x, 1, 2) +
            out_transform(n, k, y, x, 2, 0) + out_transform(n, k, y, x, 2, 1) +
            out_transform(n, k, y, x, 2, 2);
        out_hold(n, k, y, x, 0, 1) =
            out_transform(n, k, y, x, 0, 1) - out_transform(n, k, y, x, 0, 2) -
            out_transform(n, k, y, x, 0, 3) + out_transform(n, k, y, x, 1, 1) -
            out_transform(n, k, y, x, 1, 2) - out_transform(n, k, y, x, 1, 3) +
            out_transform(n, k, y, x, 2, 1) - out_transform(n, k, y, x, 2, 2) -
            out_transform(n, k, y, x, 2, 3);
        out_hold(n, k, y, x, 1, 0) =
            out_transform(n, k, y, x, 1, 0) + out_transform(n, k, y, x, 1, 1) +
            out_transform(n, k, y, x, 1, 2) - out_transform(n, k, y, x, 2, 0) -
            out_transform(n, k, y, x, 2, 1) - out_transform(n, k, y, x, 2, 2) -
            out_transform(n, k, y, x, 3, 0) - out_transform(n, k, y, x, 3, 1) -
            out_transform(n, k, y, x, 3, 2);
        out_hold(n, k, y, x, 1, 1) =
            out_transform(n, k, y, x, 1, 1) - out_transform(n, k, y, x, 1, 2) -
            out_transform(n, k, y, x, 1, 3) - out_transform(n, k, y, x, 2, 1) +
            out_transform(n, k, y, x, 2, 2) + out_transform(n, k, y, x, 2, 3) -
            out_transform(n, k, y, x, 3, 1) + out_transform(n, k, y, x, 3, 2) +
            out_transform(n, k, y, x, 3, 3);
    };

    auto f_out = [&](auto n, auto k, auto y, auto x) {
        for(int j = 0; j < OutTileSizeH; ++j)
        {
            std::size_t ho = OutTileSizeH * y + j;
            for(int i = 0; i < OutTileSizeW; ++i)
            {
                std::size_t wo    = OutTileSizeW * x + i;
                out(n, k, ho, wo) = out_hold(n, k, y, x, j, i);
            }
        }
    };

    std::size_t num_thread = std::thread::hardware_concurrency();

    make_ParallelTensorFunctor(f_in_hold, N, C, Y, X)(num_thread);
    make_ParallelTensorFunctor(f_in_transform, N, C, Y, X)(num_thread);
    make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
    make_ParallelTensorFunctor(f_out_transform, N, K, Y, X)(num_thread);
    make_ParallelTensorFunctor(f_out_hold, N, K, Y, X)(num_thread);
    make_ParallelTensorFunctor(f_out, N, K, Y, X)(num_thread);
}

// Compares two tensors element by element (over their flat mData storage) and
// prints to std::cout:
//   "error: <sum of absolute differences>"
//   "max_diff: <largest absolute difference>, <ref value there>, <result value there>"
//
// ref    : reference tensor
// result : tensor under test
//
// The sum is accumulated in double so that it stays meaningful for tensors
// with millions of elements. If the two tensors hold a different number of
// elements a warning is written to std::cerr and only the common prefix is
// compared. For empty input max_diff is reported as -1.
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    double error    = 0;
    float max_diff  = -1;
    float ref_value = 0, result_value = 0;

    const std::size_t ref_size    = ref.mData.size();
    const std::size_t result_size = result.mData.size();

    if(ref_size != result_size)
    {
        std::cerr << "check_error: size mismatch, ref has " << ref_size << " elements, result has "
                  << result_size << std::endl;
    }

    const std::size_t count = ref_size < result_size ? ref_size : result_size;

    for(std::size_t i = 0; i < count; ++i)
    {
        const float diff = std::abs(ref.mData[i] - result.mData[i]);
        error += diff;
        if(max_diff < diff)
        {
            // remember where the worst mismatch occurred
            max_diff     = diff;
            ref_value    = ref.mData[i];
            result_value = result.mData[i];
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}

int main()
{
Chao Liu's avatar
Chao Liu committed
339
#if 0
Chao Liu's avatar
Chao Liu committed
340
341
    constexpr unsigned N = 1;
    constexpr unsigned C = 1;
Chao Liu's avatar
Chao Liu committed
342
343
    constexpr unsigned HI = 34;
    constexpr unsigned WI = 34;
Chao Liu's avatar
Chao Liu committed
344
    constexpr unsigned K = 1;
Chao Liu's avatar
Chao Liu committed
345
346
    constexpr unsigned S = 3;
    constexpr unsigned R = 3;
Chao Liu's avatar
Chao Liu committed
347
#elif 1
Chao Liu's avatar
Chao Liu committed
348
349
    constexpr unsigned N = 64;
    constexpr unsigned C = 256;
Chao Liu's avatar
Chao Liu committed
350
351
    constexpr unsigned HI = 34;
    constexpr unsigned WI = 34;
Chao Liu's avatar
Chao Liu committed
352
353
354
    constexpr unsigned K = 64;
    constexpr unsigned S = 3;
    constexpr unsigned R = 3;
Chao Liu's avatar
Chao Liu committed
355
#elif 0
Chao Liu's avatar
Chao Liu committed
356
357
    constexpr unsigned N  = 64;
    constexpr unsigned C  = 64;
Chao Liu's avatar
Chao Liu committed
358
359
    constexpr unsigned HI = 56;
    constexpr unsigned WI = 56;
Chao Liu's avatar
Chao Liu committed
360
361
362
    constexpr unsigned K  = 64;
    constexpr unsigned S  = 3;
    constexpr unsigned R  = 3;
Chao Liu's avatar
Chao Liu committed
363
#elif 0
Chao Liu's avatar
Chao Liu committed
364
    constexpr unsigned N  = 64;
Chao Liu's avatar
Chao Liu committed
365
366
367
    constexpr unsigned C  = 256;
    constexpr unsigned HI = 36;
    constexpr unsigned WI = 36;
Chao Liu's avatar
Chao Liu committed
368
    constexpr unsigned K  = 64;
Chao Liu's avatar
Chao Liu committed
369
370
    constexpr unsigned S  = 5;
    constexpr unsigned R  = 5;
Chao Liu's avatar
Chao Liu committed
371
#endif
Chao Liu's avatar
Chao Liu committed
372

Chao Liu's avatar
Chao Liu committed
373
374
375
376
    auto in_nchw_desc  = make_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
    auto wei_kcsr_desc = make_ConstantTensorDescriptor(Sequence<K, C, S, R>{});
    auto out_nkhw_desc =
        get_convolution_output_default_4d_tensor_descriptor(in_nchw_desc, wei_kcsr_desc);
Chao Liu's avatar
Chao Liu committed
377

Chao Liu's avatar
Chao Liu committed
378
379
380
    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
    ostream_ConstantTensorDescriptor(wei_kcsr_desc, std::cout << "wei_kcsr_desc: ");
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
Chao Liu's avatar
Chao Liu committed
381

Chao Liu's avatar
Chao Liu committed
382
383
384
385
    Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
    Tensor<float> wei_kcsr(make_TensorDescriptor(wei_kcsr_desc));
    Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
    Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
Chao Liu's avatar
Chao Liu committed
386

Chao Liu's avatar
Chao Liu committed
387
    std::size_t num_thread = std::thread::hardware_concurrency();
Chao Liu's avatar
Chao Liu committed
388
389

#if 0
Chao Liu's avatar
Chao Liu committed
390
391
    in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
Chao Liu's avatar
Chao Liu committed
392
#elif 1
Chao Liu's avatar
Chao Liu committed
393
394
    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
    wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
Chao Liu's avatar
Chao Liu committed
395
396
#endif

Chao Liu's avatar
Chao Liu committed
397
    for(int i = 0; i < 40; ++i)
Chao Liu's avatar
Chao Liu committed
398
    {
Chao Liu's avatar
Chao Liu committed
399
#if 0
Chao Liu's avatar
Chao Liu committed
400
        device_direct_convolution_1(in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
401
#elif 0
Chao Liu's avatar
Chao Liu committed
402
403
404
        device_direct_convolution_2(
            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
#elif 0
Chao Liu's avatar
Chao Liu committed
405
        device_implicit_gemm_convolution_nchw_kcsr(
Chao Liu's avatar
Chao Liu committed
406
            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
Chao Liu's avatar
Chao Liu committed
407
#elif 1
Chao Liu's avatar
Chao Liu committed
408
409
        device_implicit_gemm_convolution_nchw_srck(
            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
Chao Liu's avatar
Chao Liu committed
410
#elif 0
Chao Liu's avatar
Chao Liu committed
411
412
        device_winograd_convolution(
            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
413
#endif
Chao Liu's avatar
Chao Liu committed
414
    }
Chao Liu's avatar
Chao Liu committed
415

Chao Liu's avatar
Chao Liu committed
416
#if 1
Chao Liu's avatar
Chao Liu committed
417
418
    host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);
    check_error(out_nkhw_host, out_nkhw_device);
419
#elif 0
Chao Liu's avatar
Chao Liu committed
420
421
    host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host);
    check_error(out_nkhw_host, out_nkhw_device);
422
#endif
Chao Liu's avatar
Chao Liu committed
423

Chao Liu's avatar
Chao Liu committed
424
#if 0
Chao Liu's avatar
Chao Liu committed
425
426
427
428
    LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
    LogRange(std::cout << "wei_kcsr: ", wei_kcsr.mData, ",") << std::endl;
    LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
    LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
Chao Liu's avatar
Chao Liu committed
429
#endif
430
}