driver.hip.cpp 30.6 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
#include <iostream>
Chao Liu's avatar
Chao Liu committed
2
3
#include <numeric>
#include <initializer_list>
Chao Liu's avatar
Chao Liu committed
4
#include <cstdlib>
Chao Liu's avatar
Chao Liu committed
5
#include <stdlib.h>
Chao Liu's avatar
Chao Liu committed
6
#include "config.h"
Chao Liu's avatar
Chao Liu committed
7
#include "tensor.hpp"
8
9
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
Chao Liu's avatar
Chao Liu committed
10
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
Chao Liu's avatar
Chao Liu committed
11
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
Chao Liu's avatar
Chao Liu committed
12
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
13
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
Chao Liu's avatar
Chao Liu committed
14
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
Chao Liu's avatar
Chao Liu committed
15
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
Chao Liu's avatar
Chao Liu committed
16
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
17
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
Chao Liu's avatar
Chao Liu committed
18

Chao Liu's avatar
Chao Liu committed
19
// Tensor generator that ignores the element coordinates and always yields 1.
struct GeneratorTensor_1
{
    // Accepts any number of index arguments; none of them is read.
    template <class... Is>
    double operator()(Is...) const
    {
        return 1;
    }
};

Chao Liu's avatar
Chao Liu committed
28
29
30
31
32
33
34
35
36
37
38
39
// Tensor generator that ignores the element coordinates and yields a
// pseudo-random integer (via std::rand) offset by min_value.
struct GeneratorTensor_2
{
    int min_value = 0; // lower bound of the generated values (inclusive)
    int max_value = 1; // upper bound of the generated values (exclusive)

    template <class... Is>
    double operator()(Is...) const
    {
        const int range = max_value - min_value;

        // An empty range would make the modulo below divide by zero, which is
        // undefined behaviour; the only representable value is min_value.
        if(range == 0)
        {
            return min_value;
        }

        return (std::rand() % range) + min_value;
    }
};

40
41
42
43
44
45
46
struct GeneratorTensor_3
{
    template <class... Is>
    double operator()(Is... is)
    {
        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};

47
        auto f_acc = [](auto a, auto b) { return 100 * a + b; };
48

49
        return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
50
51
52
    }
};

Chao Liu's avatar
Chao Liu committed
53
54
55
56
57
struct GeneratorTensor_Checkboard
{
    template <class... Ts>
    double operator()(Ts... Xs) const
    {
Chao Liu's avatar
Chao Liu committed
58
        std::array<index_t, sizeof...(Ts)> dims = {{Xs...}};
Chao Liu's avatar
Chao Liu committed
59
60
61
        return std::accumulate(dims.begin(),
                               dims.end(),
                               true,
Chao Liu's avatar
Chao Liu committed
62
                               [](bool init, index_t x) -> int { return init != (x % 2); })
Chao Liu's avatar
Chao Liu committed
63
64
65
66
67
                   ? 1
                   : -1;
    }
};

Chao Liu's avatar
Chao Liu committed
68
69
70
71
72
73
// Prints the four lengths and four strides of a compile-time tensor descriptor
// to `os` (std::cout by default) on a single line.
// This is ugly, only for 4d: any other rank is rejected by the static_assert.
template <class TConstTensorDesc>
void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
{
    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // The descriptor carries its data in its type, so a default-constructed
    // instance is queried instead of the (unnamed) argument.
    constexpr auto desc = TConstTensorDesc{};

    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
}

// this is ugly, only for 4d
template <class TConstTensorDesc>
auto make_TensorDescriptor(TConstTensorDesc)
{
    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

Chao Liu's avatar
Chao Liu committed
92
93
94
95
    constexpr auto I0   = Number<0>{};
    constexpr auto I1   = Number<1>{};
    constexpr auto I2   = Number<2>{};
    constexpr auto I3   = Number<3>{};
Chao Liu's avatar
Chao Liu committed
96
97
    constexpr auto desc = TConstTensorDesc{};

Chao Liu's avatar
Chao Liu committed
98
    std::initializer_list<index_t> lengths = {
Chao Liu's avatar
Chao Liu committed
99
        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
Chao Liu's avatar
Chao Liu committed
100
    std::initializer_list<index_t> strides = {
Chao Liu's avatar
Chao Liu committed
101
102
103
104
105
        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};

    return TensorDescriptor(lengths, strides);
}

Jing Zhang's avatar
Jing Zhang committed
106
107
108
109
110
111
112
template <class TIn,
          class TWei,
          class TOut,
          class LowerPads,
          class UpperPads,
          class Strides,
          class Dilations>
Jing Zhang's avatar
Jing Zhang committed
113
void host_direct_convolution_forw(const Tensor<TIn>& in_nchw,
114
115
116
                             const Tensor<TWei>& wei_kcyx,
                             Tensor<TOut>& out_nkhw,
                             LowerPads,
Jing Zhang's avatar
Jing Zhang committed
117
                             UpperPads,
Jing Zhang's avatar
Jing Zhang committed
118
119
                             Strides,
                             Dilations)
Chao Liu's avatar
Chao Liu committed
120
{
Chao Liu's avatar
Chao Liu committed
121
122
    index_t h_pad_low = LowerPads{}.Get(Number<0>{});
    index_t w_pad_low = LowerPads{}.Get(Number<1>{});
123

Chao Liu's avatar
Chao Liu committed
124
125
    index_t h_pad_up = UpperPads{}.Get(Number<0>{});
    index_t w_pad_up = UpperPads{}.Get(Number<1>{});
126

Jing Zhang's avatar
Jing Zhang committed
127
128
129
    index_t stride_h = Strides{}.Get(Number<0>{});
    index_t stride_w = Strides{}.Get(Number<1>{});

Jing Zhang's avatar
Jing Zhang committed
130
131
132
    index_t dilation_h = Dilations{}.Get(Number<0>{});
    index_t dilation_w = Dilations{}.Get(Number<1>{});

Chao Liu's avatar
Chao Liu committed
133
134
    auto f = [&](auto n, auto k, auto ho, auto wo) {
        double v = 0;
Chao Liu's avatar
Chao Liu committed
135
        for(int c = 0; c < wei_kcyx.mDesc.GetLengths()[1]; ++c)
Chao Liu's avatar
Chao Liu committed
136
        {
Chao Liu's avatar
Chao Liu committed
137
            for(int y = 0; y < wei_kcyx.mDesc.GetLengths()[2]; ++y)
Chao Liu's avatar
Chao Liu committed
138
            {
Jing Zhang's avatar
Jing Zhang committed
139
                int hi = ho * stride_h + y * dilation_h - h_pad_low;
Chao Liu's avatar
Chao Liu committed
140
                for(int x = 0; x < wei_kcyx.mDesc.GetLengths()[3]; ++x)
Chao Liu's avatar
Chao Liu committed
141
                {
Jing Zhang's avatar
Jing Zhang committed
142
                    int wi = wo * stride_w + x * dilation_w - w_pad_low;
143
144
145
                    if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                       wi < in_nchw.mDesc.GetLengths()[3])
                    {
146
                        v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
147
                    }
Chao Liu's avatar
Chao Liu committed
148
149
150
                }
            }
        }
151
        out_nkhw(n, k, ho, wo) = v;
Chao Liu's avatar
Chao Liu committed
152
153
154
    };

    auto f_par = make_ParallelTensorFunctor(f,
155
156
157
158
                                            out_nkhw.mDesc.GetLengths()[0],
                                            out_nkhw.mDesc.GetLengths()[1],
                                            out_nkhw.mDesc.GetLengths()[2],
                                            out_nkhw.mDesc.GetLengths()[3]);
Chao Liu's avatar
Chao Liu committed
159

Chao Liu's avatar
Chao Liu committed
160
    f_par(std::thread::hardware_concurrency());
Chao Liu's avatar
Chao Liu committed
161
162
}

Jing Zhang's avatar
Jing Zhang committed
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
// Host reference implementation of the backward-data pass of a direct
// convolution: reconstructs the (n, c, hi, wi) tensor from the (n, k, ho, wo)
// tensor and the (k, c, y, x) filter.
//
//   in_nchw  : result, indexed as (n, c, hi, wi); every element is overwritten
//   wei_kcyx : filter, indexed as (k, c, y, x)
//   out_nkhw : source, indexed as (n, k, ho, wo)
//
// LowerPads / Strides / Dilations are tag types whose Get(Number<0>{}) and
// Get(Number<1>{}) give the H and W values. UpperPads is part of the interface
// but is not read here.
//
// The forward pass maps hi = ho * stride_h + y * dilation_h - h_pad_low, so a
// filter tap (y, x) contributes to (hi, wi) only when
//   hi + h_pad_low - y * dilation_h  is non-negative and divisible by stride_h
// (same for W) and the resulting (ho, wo) lies inside out_nkhw.
template <class TIn,
          class TWei,
          class TOut,
          class LowerPads,
          class UpperPads,
          class Strides,
          class Dilations>
void host_direct_convolution_back(Tensor<TOut>& in_nchw,
                                  const Tensor<TWei>& wei_kcyx,
                                  const Tensor<TIn>& out_nkhw,
                                  LowerPads,
                                  UpperPads,
                                  Strides,
                                  Dilations)
{
    // All index arithmetic is done in signed int: the numerators below can be
    // negative, and unsigned arithmetic would wrap instead.
    const int h_pad_low = static_cast<int>(LowerPads{}.Get(Number<0>{}));
    const int w_pad_low = static_cast<int>(LowerPads{}.Get(Number<1>{}));

    const int stride_h = static_cast<int>(Strides{}.Get(Number<0>{}));
    const int stride_w = static_cast<int>(Strides{}.Get(Number<1>{}));

    const int dilation_h = static_cast<int>(Dilations{}.Get(Number<0>{}));
    const int dilation_w = static_cast<int>(Dilations{}.Get(Number<1>{}));

    const int K = static_cast<int>(wei_kcyx.mDesc.GetLengths()[0]);
    const int Y = static_cast<int>(wei_kcyx.mDesc.GetLengths()[2]);
    const int X = static_cast<int>(wei_kcyx.mDesc.GetLengths()[3]);

    const int HO = static_cast<int>(out_nkhw.mDesc.GetLengths()[2]);
    const int WO = static_cast<int>(out_nkhw.mDesc.GetLengths()[3]);

    // Computes one element (n, c, hi, wi) by summing over k, y, x.
    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;
        for(int k = 0; k < K; ++k)
        {
            for(int y = 0; y < Y; ++y)
            {
                // ho * stride_h must equal this numerator exactly.
                const int h_num = static_cast<int>(hi) + h_pad_low - y * dilation_h;
                if(h_num < 0 || h_num % stride_h != 0)
                {
                    continue;
                }

                const int ho = h_num / stride_h;
                if(ho >= HO)
                {
                    continue;
                }

                for(int x = 0; x < X; ++x)
                {
                    // wo * stride_w must equal this numerator exactly.
                    const int w_num = static_cast<int>(wi) + w_pad_low - x * dilation_w;
                    if(w_num < 0 || w_num % stride_w != 0)
                    {
                        continue;
                    }

                    const int wo = w_num / stride_w;
                    if(wo >= WO)
                    {
                        continue;
                    }

                    v += double(out_nkhw(n, k, ho, wo)) * double(wei_kcyx(k, c, y, x));
                }
            }
        }
        in_nchw(n, c, hi, wi) = v;
    };

    // Apply f over the whole (n, c, hi, wi) index space; the functor is invoked
    // with the hardware thread count (presumably the number of worker threads).
    auto f_par = make_ParallelTensorFunctor(f,
                                            in_nchw.mDesc.GetLengths()[0],
                                            in_nchw.mDesc.GetLengths()[1],
                                            in_nchw.mDesc.GetLengths()[2],
                                            in_nchw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}

223
224
225
226
227
228
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
                                   Tensor<TOut>& out_nkhw,
                                   LowerPads,
                                   UpperPads)
Chao Liu's avatar
Chao Liu committed
229
{
Chao Liu's avatar
Chao Liu committed
230
231
    constexpr std::size_t HoPerTile = 2;
    constexpr std::size_t WoPerTile = 2;
Chao Liu's avatar
Chao Liu committed
232

Chao Liu's avatar
Chao Liu committed
233
234
235
236
    std::size_t N  = in_nchw.mDesc.GetLengths()[0];
    std::size_t C  = in_nchw.mDesc.GetLengths()[1];
    std::size_t HI = in_nchw.mDesc.GetLengths()[2];
    std::size_t WI = in_nchw.mDesc.GetLengths()[3];
Chao Liu's avatar
Chao Liu committed
237

Chao Liu's avatar
Chao Liu committed
238
239
240
    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
Chao Liu's avatar
Chao Liu committed
241

242
243
    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
Chao Liu's avatar
Chao Liu committed
244

Chao Liu's avatar
Chao Liu committed
245
246
    index_t h_pad_low = LowerPads{}.Get(Number<0>{});
    index_t w_pad_low = LowerPads{}.Get(Number<1>{});
247

Chao Liu's avatar
Chao Liu committed
248
249
    index_t h_pad_up = UpperPads{}.Get(Number<0>{});
    index_t w_pad_up = UpperPads{}.Get(Number<1>{});
250

Chao Liu's avatar
Chao Liu committed
251
252
    std::size_t HiPerTile = HoPerTile + Y - 1;
    std::size_t WiPerTile = WoPerTile + X - 1;
Chao Liu's avatar
Chao Liu committed
253

Chao Liu's avatar
Chao Liu committed
254
255
    std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
Chao Liu's avatar
Chao Liu committed
256

257
258
259
260
261
    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
Chao Liu's avatar
Chao Liu committed
262

Chao Liu's avatar
Chao Liu committed
263
264
    auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
        for(int j = 0; j < HiPerTile; ++j)
Chao Liu's avatar
Chao Liu committed
265
        {
Chao Liu's avatar
Chao Liu committed
266
267
            int hi = HoPerTile * htile + j - h_pad_low;
            for(int i = 0; i < WiPerTile; ++i)
Chao Liu's avatar
Chao Liu committed
268
            {
Chao Liu's avatar
Chao Liu committed
269
                int wi = WoPerTile * wtile + i - w_pad_low;
270
271
272
273

                if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
                   wi < in_nchw.mDesc.GetLengths()[3])
                {
Chao Liu's avatar
Chao Liu committed
274
                    in_hold(n, c, htile, wtile, j, i) = in_nchw(n, c, hi, wi);
275
276
277
                }
                else
                {
278
                    in_hold(n, c, htile, wtile, j, i) = TIn(0);
279
                }
Chao Liu's avatar
Chao Liu committed
280
281
282
283
            }
        }
    };

Chao Liu's avatar
Chao Liu committed
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
    auto f_in_transform = [&](auto n, auto c, auto htile, auto wtile) {
        in_transform(n, c, htile, wtile, 0, 0) =
            in_hold(n, c, htile, wtile, 0, 0) - in_hold(n, c, htile, wtile, 0, 2) -
            in_hold(n, c, htile, wtile, 2, 0) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 1) =
            in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) -
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 2) =
            -in_hold(n, c, htile, wtile, 0, 1) + in_hold(n, c, htile, wtile, 0, 2) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 0, 3) =
            in_hold(n, c, htile, wtile, 0, 1) - in_hold(n, c, htile, wtile, 0, 3) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 1, 0) =
            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 1) =
            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 2) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 1, 3) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 2, 0) =
            -in_hold(n, c, htile, wtile, 1, 0) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 0) - in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 1) =
            -in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 2) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 2, 1) + in_hold(n, c, htile, wtile, 2, 2);
        in_transform(n, c, htile, wtile, 2, 3) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 3) +
            in_hold(n, c, htile, wtile, 2, 1) - in_hold(n, c, htile, wtile, 2, 3);

        in_transform(n, c, htile, wtile, 3, 0) =
            in_hold(n, c, htile, wtile, 1, 0) - in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 3, 0) + in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 1) =
            in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) -
            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 2) =
            -in_hold(n, c, htile, wtile, 1, 1) + in_hold(n, c, htile, wtile, 1, 2) +
            in_hold(n, c, htile, wtile, 3, 1) - in_hold(n, c, htile, wtile, 3, 2);
        in_transform(n, c, htile, wtile, 3, 3) =
            in_hold(n, c, htile, wtile, 1, 1) - in_hold(n, c, htile, wtile, 1, 3) -
            in_hold(n, c, htile, wtile, 3, 1) + in_hold(n, c, htile, wtile, 3, 3);
Chao Liu's avatar
Chao Liu committed
336
337
338
    };

    auto f_wei_transform = [&](auto k, auto c) {
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
        wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
        wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 0, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 0, 2));
        wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));

        wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 1, 1) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 2) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
            0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 2, 1) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 2) =
            0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
            0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
            0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
            0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
            0.25 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
                                    0.5 * double(wei_kcyx(k, c, 1, 2)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));

        wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
        wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
                                    0.5 * double(wei_kcyx(k, c, 2, 1)) +
                                    0.5 * double(wei_kcyx(k, c, 2, 2));
        wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
Chao Liu's avatar
Chao Liu committed
394
395
    };

Chao Liu's avatar
Chao Liu committed
396
397
    auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
        for(int j = 0; j < HiPerTile; ++j)
Chao Liu's avatar
Chao Liu committed
398
        {
Chao Liu's avatar
Chao Liu committed
399
            for(int i = 0; i < WiPerTile; ++i)
Chao Liu's avatar
Chao Liu committed
400
401
402
403
            {
                double v = 0;
                for(int c = 0; c < C; ++c)
                {
Chao Liu's avatar
Chao Liu committed
404
                    v += in_transform(n, c, htile, wtile, j, i) * wei_transform(k, c, j, i);
Chao Liu's avatar
Chao Liu committed
405
406
                }

Chao Liu's avatar
Chao Liu committed
407
                out_transform(n, k, htile, wtile, j, i) = v;
Chao Liu's avatar
Chao Liu committed
408
409
410
411
            }
        }
    };

Chao Liu's avatar
Chao Liu committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
    auto f_out_hold = [&](auto n, auto k, auto htile, auto wtile) {
        out_hold(n, k, htile, wtile, 0, 0) =
            out_transform(n, k, htile, wtile, 0, 0) + out_transform(n, k, htile, wtile, 0, 1) +
            out_transform(n, k, htile, wtile, 0, 2) + out_transform(n, k, htile, wtile, 1, 0) +
            out_transform(n, k, htile, wtile, 1, 1) + out_transform(n, k, htile, wtile, 1, 2) +
            out_transform(n, k, htile, wtile, 2, 0) + out_transform(n, k, htile, wtile, 2, 1) +
            out_transform(n, k, htile, wtile, 2, 2);
        out_hold(n, k, htile, wtile, 0, 1) =
            out_transform(n, k, htile, wtile, 0, 1) - out_transform(n, k, htile, wtile, 0, 2) -
            out_transform(n, k, htile, wtile, 0, 3) + out_transform(n, k, htile, wtile, 1, 1) -
            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 1, 3) +
            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
            out_transform(n, k, htile, wtile, 2, 3);
        out_hold(n, k, htile, wtile, 1, 0) =
            out_transform(n, k, htile, wtile, 1, 0) + out_transform(n, k, htile, wtile, 1, 1) +
            out_transform(n, k, htile, wtile, 1, 2) - out_transform(n, k, htile, wtile, 2, 0) -
            out_transform(n, k, htile, wtile, 2, 1) - out_transform(n, k, htile, wtile, 2, 2) -
            out_transform(n, k, htile, wtile, 3, 0) - out_transform(n, k, htile, wtile, 3, 1) -
            out_transform(n, k, htile, wtile, 3, 2);
        out_hold(n, k, htile, wtile, 1, 1) =
            out_transform(n, k, htile, wtile, 1, 1) - out_transform(n, k, htile, wtile, 1, 2) -
            out_transform(n, k, htile, wtile, 1, 3) - out_transform(n, k, htile, wtile, 2, 1) +
            out_transform(n, k, htile, wtile, 2, 2) + out_transform(n, k, htile, wtile, 2, 3) -
            out_transform(n, k, htile, wtile, 3, 1) + out_transform(n, k, htile, wtile, 3, 2) +
            out_transform(n, k, htile, wtile, 3, 3);
Chao Liu's avatar
Chao Liu committed
437
438
    };

Chao Liu's avatar
Chao Liu committed
439
440
    auto f_out = [&](auto n, auto k, auto htile, auto wtile) {
        for(int j = 0; j < HoPerTile; ++j)
Chao Liu's avatar
Chao Liu committed
441
        {
Chao Liu's avatar
Chao Liu committed
442
443
            std::size_t ho = HoPerTile * htile + j;
            for(int i = 0; i < WoPerTile; ++i)
Chao Liu's avatar
Chao Liu committed
444
            {
445
                std::size_t wo = WoPerTile * wtile + i;
446
                out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
Chao Liu's avatar
Chao Liu committed
447
448
449
450
451
452
            }
        }
    };

    std::size_t num_thread = std::thread::hardware_concurrency();

Chao Liu's avatar
Chao Liu committed
453
454
    make_ParallelTensorFunctor(f_in_hold, N, C, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_in_transform, N, C, HTile, WTile)(num_thread);
Chao Liu's avatar
Chao Liu committed
455
    make_ParallelTensorFunctor(f_wei_transform, K, C)(num_thread);
Chao Liu's avatar
Chao Liu committed
456
457
458
    make_ParallelTensorFunctor(f_out_transform, N, K, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
Chao Liu's avatar
Chao Liu committed
459
460
461
462
463
464
}

// Compares `result` against `ref` element by element and prints to std::cout:
//   - "error": the sum of absolute differences over all elements
//   - "max_diff": the largest absolute difference, followed by the reference
//     and result values at the first position where it occurs
// Iterates over ref.mData.size() elements; `result` is expected to hold at
// least as many.
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    // Accumulate in double so that many small differences are not lost to
    // float rounding.
    double error        = 0;
    double max_diff     = -1; // below any absolute difference, so the first element wins
    double ref_value    = 0;
    double result_value = 0;

    for(std::size_t i = 0; i < ref.mData.size(); ++i)
    {
        const double expected = double(ref.mData[i]);
        const double actual   = double(result.mData[i]);
        const double diff     = std::abs(expected - actual);

        error += diff;

        if(max_diff < diff)
        {
            max_diff     = diff;
            ref_value    = expected;
            result_value = actual;
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}

Chao Liu's avatar
Chao Liu committed
483
int main(int argc, char* argv[])
Chao Liu's avatar
Chao Liu committed
484
{
Jing Zhang's avatar
Jing Zhang committed
485
486
    constexpr index_t HStride = 2;
    constexpr index_t WStride = 2;
Jing Zhang's avatar
Jing Zhang committed
487

Jing Zhang's avatar
Jing Zhang committed
488
489
490
491
492
    constexpr index_t HDilation = 1;
    constexpr index_t WDilation = 1;

    constexpr index_t Direction = 2; //1: Forward; 2:Backward
#if 1
Chao Liu's avatar
Chao Liu committed
493
    constexpr index_t N  = 8;
Jing Zhang's avatar
Jing Zhang committed
494
495
496
    constexpr index_t C  = 128;
    constexpr index_t HI = 16;
    constexpr index_t WI = 16;
Chao Liu's avatar
Chao Liu committed
497
    constexpr index_t K  = 128;
Jing Zhang's avatar
Jing Zhang committed
498
499
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;
Chao Liu's avatar
Chao Liu committed
500
501
502

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
503
#elif 0
504
    // 3x3, 34x34
505
506
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
507
508
    constexpr index_t HI = 34;
    constexpr index_t WI = 34;
509
510
511
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
Chao Liu's avatar
Chao Liu committed
512
513
514

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Chao Liu's avatar
Chao Liu committed
515
#elif 0
516
    // 3x3, 56x56
Chao Liu's avatar
Chao Liu committed
517
518
    constexpr index_t N  = 64;
    constexpr index_t C  = 64;
519
520
    constexpr index_t HI = 56;
    constexpr index_t WI = 56;
Chao Liu's avatar
Chao Liu committed
521
522
523
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
Chao Liu's avatar
Chao Liu committed
524
525
526

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Jing Zhang's avatar
Jing Zhang committed
527
#elif 0
Chao Liu's avatar
Chao Liu committed
528
529
530
531
532
    // 3x3 filter, 28x28 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 256;
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
533
    constexpr index_t K  = 128;
Chao Liu's avatar
Chao Liu committed
534
535
536
537
538
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Jing Zhang's avatar
Jing Zhang committed
539
#elif 1
Chao Liu's avatar
Chao Liu committed
540
    // 1x1 filter, 28x28 image
541
542
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
Chao Liu's avatar
Chao Liu committed
543
544
545
546
547
548
549
550
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
    constexpr index_t K  = 512;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
551
552
#elif 0
    // 3x3 filter, 20x84 image, 1x1 padding
Chao Liu's avatar
Chao Liu committed
553
554
555
556
557
558
559
560
561
562
    constexpr index_t N  = 16;
    constexpr index_t C  = 256;
    constexpr index_t HI = 20;
    constexpr index_t WI = 84;
    constexpr index_t K  = 256;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    constexpr index_t HPad = 1;
    constexpr index_t WPad = 1;
Chao Liu's avatar
Chao Liu committed
563
564
#elif 0
    // 3x3 filter, 112x112 image, 1x1 padding
Chao Liu's avatar
Chao Liu committed
565
566
567
568
569
570
571
572
573
574
    constexpr index_t N  = 16;
    constexpr index_t C  = 64;
    constexpr index_t HI = 112;
    constexpr index_t WI = 112;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    constexpr index_t HPad = 1;
    constexpr index_t WPad = 1;
575
#elif 0
576
577
578
579
580
581
582
583
584
585
586
    // 5x5 filter, 20x86 image
    constexpr index_t N  = 16;
    constexpr index_t C  = 256;
    constexpr index_t HI = 20;
    constexpr index_t WI = 86;
    constexpr index_t K  = 512;
    constexpr index_t Y  = 5;
    constexpr index_t X  = 5;

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Chao Liu's avatar
Chao Liu committed
587
588
#elif 0
    // 5x5 filter, 20x86 image, 1x1 padding
Chao Liu's avatar
Chao Liu committed
589
590
591
592
593
594
595
596
597
598
    constexpr index_t N  = 16;
    constexpr index_t C  = 256;
    constexpr index_t HI = 20;
    constexpr index_t WI = 86;
    constexpr index_t K  = 512;
    constexpr index_t Y  = 5;
    constexpr index_t X  = 5;

    constexpr index_t HPad = 1;
    constexpr index_t WPad = 1;
Chao Liu's avatar
Chao Liu committed
599
600
#elif 0
    // 5x5 filter, 28x28 image, 2x2 padding
Chao Liu's avatar
Chao Liu committed
601
602
603
604
605
606
607
608
609
610
    constexpr index_t N  = 16;
    constexpr index_t C  = 192;
    constexpr index_t HI = 28;
    constexpr index_t WI = 28;
    constexpr index_t K  = 32;
    constexpr index_t Y  = 5;
    constexpr index_t X  = 5;

    constexpr index_t HPad = 2;
    constexpr index_t WPad = 2;
Chao Liu's avatar
Chao Liu committed
611
#elif 0
612
    // 3x3 filter, 14x14 image
Chao Liu's avatar
Chao Liu committed
613
    constexpr index_t N  = 128;
614
    constexpr index_t C  = 256;
Chao Liu's avatar
Chao Liu committed
615
616
    constexpr index_t HI = 14;
    constexpr index_t WI = 14;
617
618
619
    constexpr index_t K  = 128;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;
Chao Liu's avatar
Chao Liu committed
620
621
622

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Chao Liu's avatar
Chao Liu committed
623
#elif 0
624
    // 1x1 filter, 14x14 image
Chao Liu's avatar
Chao Liu committed
625
626
627
628
629
630
631
632
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
    constexpr index_t HI = 14;
    constexpr index_t WI = 14;
    constexpr index_t K  = 512;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

Chao Liu's avatar
Chao Liu committed
633
634
635
636
637
638
639
640
641
642
643
644
    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
#elif 0
    // 1x1 filter, 7x7 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
    constexpr index_t HI = 7;
    constexpr index_t WI = 7;
    constexpr index_t K  = 2048;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

645
646
    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Chao Liu's avatar
Chao Liu committed
647
#elif 0
648
649
    // 1x1 filter, 73x73 image
    constexpr index_t N  = 128;
Chao Liu's avatar
Chao Liu committed
650
    constexpr index_t C  = 512;
651
652
653
654
655
656
    constexpr index_t HI = 73;
    constexpr index_t WI = 73;
    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

Chao Liu's avatar
Chao Liu committed
657
658
    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
Chao Liu's avatar
Chao Liu committed
659
#endif
Chao Liu's avatar
Chao Liu committed
660

661
662
663
    auto lower_pads = Sequence<HPad, WPad>{};
    auto upper_pads = Sequence<HPad, WPad>{};

Jing Zhang's avatar
Jing Zhang committed
664
665
    auto strides   = Sequence<HStride, WStride>{};
    auto dilations = Sequence<HDilation, WDilation>{};
Jing Zhang's avatar
Jing Zhang committed
666

Chao Liu's avatar
Chao Liu committed
667
668
    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
Jing Zhang's avatar
Jing Zhang committed
669
670
    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, strides, dilations);
Chao Liu's avatar
Chao Liu committed
671

Jing Zhang's avatar
Jing Zhang committed
672
673
    auto wei_ckyx_back_desc = wei_kcyx_desc.ReorderGivenNew2Old(Sequence<1, 0, 2, 3>{});

Chao Liu's avatar
Chao Liu committed
674
    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
Chao Liu's avatar
Chao Liu committed
675
    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
Chao Liu's avatar
Chao Liu committed
676
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
Chao Liu's avatar
Chao Liu committed
677

Chao Liu's avatar
Chao Liu committed
678
679
    using in_data_t  = float;
    using out_data_t = float;
680
    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Jing Zhang's avatar
Jing Zhang committed
681
682
    Tensor<in_data_t> out_nkhw(make_TensorDescriptor(out_nkhw_desc));
    Tensor<in_data_t> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
683
    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
Jing Zhang's avatar
Jing Zhang committed
684
    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Chao Liu's avatar
Chao Liu committed
685

Chao Liu's avatar
Chao Liu committed
686
    std::size_t num_thread = std::thread::hardware_concurrency();
Chao Liu's avatar
Chao Liu committed
687

Chao Liu's avatar
Chao Liu committed
688
689
690
691
692
693
694
    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
Chao Liu's avatar
Chao Liu committed
695
    index_t nrepeat      = atoi(argv[2]);
696
697
698

    if(do_verification)
    {
Chao Liu's avatar
Chao Liu committed
699
#if 0
700
        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
Chao Liu's avatar
Chao Liu committed
701
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
Chao Liu's avatar
Chao Liu committed
702
703
704
#elif 0
        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
705
706
707
#elif 0
        in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
Chao Liu's avatar
Chao Liu committed
708
#elif 1
709
        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
Jing Zhang's avatar
Jing Zhang committed
710
        out_nkhw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
Chao Liu's avatar
Chao Liu committed
711
        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
Chao Liu's avatar
Chao Liu committed
712
#elif 0
713
714
715
716
717
718
        in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);

        auto gen_wei = [](auto... is) {
            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
        };
        wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
Chao Liu's avatar
Chao Liu committed
719
#endif
720
    }
Chao Liu's avatar
Chao Liu committed
721

Chao Liu's avatar
Chao Liu committed
722
#if 1
Chao Liu's avatar
Chao Liu committed
723
#if 0
Chao Liu's avatar
Chao Liu committed
724
    device_direct_convolution_1
Chao Liu's avatar
Chao Liu committed
725
#elif 0
Chao Liu's avatar
Chao Liu committed
726
    device_convolution_direct_v2_nchw_kcyx_nkhw
Chao Liu's avatar
Chao Liu committed
727
#elif 0
Chao Liu's avatar
Chao Liu committed
728
    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
Chao Liu's avatar
Chao Liu committed
729
#elif 0
730
    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
Chao Liu's avatar
Chao Liu committed
731
#elif 0
732
    device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
Chao Liu's avatar
Chao Liu committed
733
#elif 0
Chao Liu's avatar
Chao Liu committed
734
    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
735
#elif 0
Chao Liu's avatar
Chao Liu committed
736
    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
Chao Liu's avatar
Chao Liu committed
737
#elif 0
Chao Liu's avatar
Chao Liu committed
738
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
739
740
#elif 1
    device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
741
#endif
Jing Zhang's avatar
Jing Zhang committed
742
743
744
    (out_nkhw_desc,
     out_nkhw,
     wei_ckyx_back_desc,
Jing Zhang's avatar
Jing Zhang committed
745
     wei_kcyx,
Jing Zhang's avatar
Jing Zhang committed
746
     in_nchw_desc,
Jing Zhang's avatar
Jing Zhang committed
747
     strides,
Jing Zhang's avatar
Jing Zhang committed
748
     dilations,
Jing Zhang's avatar
Jing Zhang committed
749
750
751
     in_nchw_device,
     nrepeat
     );
752

Chao Liu's avatar
Chao Liu committed
753
#elif 1
Chao Liu's avatar
Chao Liu committed
754
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
Chao Liu's avatar
Chao Liu committed
755
                                                             in_nchw,
Chao Liu's avatar
Chao Liu committed
756
757
                                                             wei_kcyx_desc,
                                                             wei_kcyx,
Chao Liu's avatar
Chao Liu committed
758
759
760
761
762
                                                             out_nkhw_desc,
                                                             out_nkhw_device,
                                                             lower_pads,
                                                             upper_pads,
                                                             nrepeat);
763
#endif
Chao Liu's avatar
Chao Liu committed
764

765
    if(do_verification)
766
    {
Jing Zhang's avatar
Jing Zhang committed
767
#if 0
Chao Liu's avatar
Chao Liu committed
768
        if(Y == 3 && X == 3)
769
        {
Chao Liu's avatar
Chao Liu committed
770
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
771
772
        }
        else
Chao Liu's avatar
Chao Liu committed
773
#endif
Jing Zhang's avatar
Jing Zhang committed
774
775
776
777
778
779
780
        if(Direction == 1)
        {
            host_direct_convolution_forw(
                in_nchw, wei_kcyx, out_nkhw, lower_pads, upper_pads, strides, dilations);
            check_error(out_nkhw, out_nkhw_device);
        }
        else
781
        {
Jing Zhang's avatar
Jing Zhang committed
782
783
784
            host_direct_convolution_back(
                in_nchw, wei_kcyx, out_nkhw, lower_pads, upper_pads, strides, dilations);
            check_error(in_nchw, in_nchw_device);
785
        }
Chao Liu's avatar
Chao Liu committed
786

Chao Liu's avatar
Chao Liu committed
787
#if 0
788
        LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
Chao Liu's avatar
Chao Liu committed
789
        LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
790
791
        LogRange(std::cout << "out_nkhw_host  : ", out_nkhw_host.mData, ",") << std::endl;
        LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
Chao Liu's avatar
Chao Liu committed
792
#endif
793
    }
794
}