lowering.cpp 32.2 KB
Newer Older
Paul's avatar
Paul committed
1

Paul's avatar
Paul committed
2
3
4
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/dfor.hpp>
5
#include <migraphx/op/batch_norm_inference.hpp>
Paul's avatar
Paul committed
6
#include <migraphx/op/convolution.hpp>
kahmed10's avatar
kahmed10 committed
7
#include <migraphx/op/deconvolution.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
8
#include <migraphx/op/quant_convolution.hpp>
Paul's avatar
Paul committed
9
#include <migraphx/op/dot.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
10
#include <migraphx/op/quant_dot.hpp>
Paul's avatar
Paul committed
11
12
13
14
15
16
17
18
#include <migraphx/op/elu.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
19
20
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
Shucai Xiao's avatar
Shucai Xiao committed
21
#include <migraphx/op/rnn_var_sl_last_output.hpp>
Paul's avatar
Paul committed
22
23
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
Paul's avatar
Paul committed
24
#include <migraphx/par_dfor.hpp>
25
#include <migraphx/clamp.hpp>
Paul's avatar
Paul committed
26
#include <migraphx/cpu/gemm.hpp>
27
#include <migraphx/register_op.hpp>
Paul's avatar
Paul committed
28
#include <unordered_map>
Paul's avatar
Paul committed
29
#include <utility>
kahmed10's avatar
kahmed10 committed
30
#include <iostream>
Paul's avatar
Paul committed
31

Paul's avatar
Paul committed
32
namespace migraphx {
Paul's avatar
Paul committed
33
inline namespace MIGRAPHX_INLINE_NS {
Paul's avatar
Paul committed
34
35
36
37
38
39
40
41
namespace cpu {

// Returns the zero value of T; the argument is used only for type deduction.
template <typename T>
T zero(const T&)
{
    return static_cast<T>(0);
}

Khalique's avatar
Khalique committed
42
43
44
45
// Identity pass-through whose RETURN TYPE maps integral T to its signed
// counterpart (e.g. unsigned -> int) while leaving non-integral T unchanged;
// the implicit conversion in `return x` performs the unsigned->signed cast.
// Both branches of std::conditional_t are trait classes (std::make_signed<T>
// and std::enable_if<true, T>, NOT their ::type) so that std::make_signed is
// never instantiated for non-integral T, where it would be ill-formed; the
// trailing ::type is applied only to the branch that was selected.
template <class T>
typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::enable_if<true, T>>::
    type
    make_signed(T x)
{
    return x;
}

50
51
52
53
//
// cpu implementation of batch norm for inference
//
// inputs are:
// args[0] -> input data buffer
// args[1] -> gamma (scale)
// args[2] -> bias
// args[3] -> mini batch mean
// args[4] -> mini batch variance
//
// The equation to compute batch norm for inference is:
//
// output[i] = bias + gamma * (input[i] - mean) / sqrt(variance + epsilon)
//
// the input data format should be nchw
//
// CPU reference implementation of batch normalization for inference (NCHW).
// Computes: out = gamma * (x - mean) / sqrt(variance + epsilon) + bias,
// where the statistics/affine parameters are indexed per channel (spatial
// mode) or per activation (per_activation mode).
struct cpu_batch_norm_inference
{
    op::batch_norm_inference op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::batch_norm_inference"; }

    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument output{output_shape};

        // Argument order established by the assignments below:
        // args[0]=input, args[1]=gamma, args[2]=bias, args[3]=mean, args[4]=variance.
        double epsilon           = op.epsilon;
        auto input               = args[0];
        auto arg_gamma           = args[1];
        auto arg_bias            = args[2];
        auto mini_batch_mean     = args[3];
        auto mini_batch_variance = args[4];

        // Spatial mode: one (mean, variance, gamma, bias) per channel; the
        // channel index is the second component of the multi-index.
        if(op.bn_mode == op::batch_norm_inference::spatial)
        {
            visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
                    par_for(output_shape.elements(), [&](auto i) {
                        auto idx = output_shape.multi(i);
                        auto c   = idx[1];
                        assert((variance[c] + epsilon) > 0);
                        result[i] =
                            gamma[c] * (buffer[i] - mean[c]) / std::sqrt(variance[c] + epsilon) +
                            bias[c];
                    });
                });
        }

        // Per-activation mode: parameters vary per (c, h, w); zero out the
        // batch component of the multi-index to compute the parameter offset.
        if(op.bn_mode == op::batch_norm_inference::per_activation)
        {
            visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
                    par_for(output_shape.elements(), [&](auto i) {
                        auto idx   = output_shape.multi(i);
                        idx[0]     = 0;
                        auto index = output_shape.index(idx);

                        assert((variance[index] + epsilon) > 0);
                        result[i] = gamma[index] * (buffer[i] - mean[index]) /
                                        std::sqrt(variance[index] + epsilon) +
                                    bias[index];
                    });
                });
        }

        return output;
    }
};
MIGRAPHX_REGISTER_OP(cpu_batch_norm_inference)
127

Khalique's avatar
Khalique committed
128
struct cpu_lrn
Khalique's avatar
Khalique committed
129
{
Khalique's avatar
Khalique committed
130
    op::lrn op;
Khalique's avatar
Khalique committed
131

132
133
134
135
136
137
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

Khalique's avatar
Khalique committed
138
    std::string name() const { return "cpu::lrn"; }
Khalique's avatar
Khalique committed
139
140
141
142
143
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
Khalique's avatar
Khalique committed
144
145
146
147
            int n_batch         = output_shape.lens()[0];
            int channels        = output_shape.lens()[1];
            int height          = output_shape.lens()[2];
            int width           = output_shape.lens()[3];
Paul's avatar
Paul committed
148
            float alphaoverarea = op.alpha / float(op.size);
149
150
            int radius_lower    = (op.size - 1) / 2;
            int radius_upper    = op.size / 2 + 1;
Khalique's avatar
Khalique committed
151

152
            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
Khalique's avatar
Khalique committed
153
                float scale = 0;
Khalique's avatar
Khalique committed
154
                dfor(channels)([&](int c) {
155
156
                    auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower);
                    auto end   = (c + radius_upper) > channels ? channels : (c + radius_upper);
Khalique's avatar
Khalique committed
157
158
                    for(auto k = start; k < end; ++k)
                    {
Khalique's avatar
Khalique committed
159
                        scale += std::pow(input(b, k, h, w), 2);
Khalique's avatar
Khalique committed
160
161
162
                    }
                    scale *= alphaoverarea;
                    scale += op.bias;
Khalique's avatar
Khalique committed
163
                    scale              = std::pow(scale, -op.beta);
Khalique's avatar
Khalique committed
164
165
166
167
168
169
170
                    output(b, c, h, w) = input(b, c, h, w) * scale;
                });
            });
        });
        return result;
    }
};
171
MIGRAPHX_REGISTER_OP(cpu_lrn)
Khalique's avatar
Khalique committed
172

Paul Fultz II's avatar
Paul Fultz II committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// Visit the first tensor on its own element type, then visit the remaining
// tensors together; this lets the first argument carry a different type than
// the rest (needed for quantized ops, e.g. int32 output with int8 inputs).
template <class V, class T, class... Ts>
void visit_quantize_impl(V&& v, T&& x, Ts&&... xs)
{
    x.visit([&](auto first) {
        visit_all(xs...)([&](auto... rest) { v(first, rest...); });
    });
}

// Returns a visitor-dispatcher over x and xs; see visit_quantize_impl.
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70100
template <class T, class... Ts>
auto visit_quantize(T&& x, Ts&&... xs)
{
    return [&](auto v) { visit_quantize_impl(v, x, xs...); };
}

188
// Generic CPU reference convolution; Op is the operator type (e.g.
// op::convolution or op::quant_convolution) supplying stride/padding/group.
// Registered via auto_register_op, hence no MIGRAPHX_REGISTER_OP line.
template <class Op>
struct cpu_convolution : auto_register_op<cpu_convolution<Op>>
{
    cpu_convolution() = default;

    cpu_convolution(Op pop) : op(std::move(pop)) {}

    Op op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // visit_quantize lets the output element type differ from the
        // input/weight types (quantized convolution).
        visit_quantize(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            auto in_lens = input.get_shape().lens();

            auto wei_lens = weights.get_shape().lens();
            auto wei_n    = wei_lens[0]; // number of output feature maps
            auto wei_c    = wei_lens[1]; // input channels per group
            // Window iterated per output element: [in_channel, k0, k1, ...]
            std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());

            // One independent reduction per output element.
            par_for(output_shape.elements(), [&](auto i) {
                auto idx_o = output_shape.multi(i);
                auto w     = idx_o[1]; // output feature map index
                auto n_dim = idx_o.size();

                // Top-left corner of the receptive field in input
                // coordinates (may be negative because of padding).
                std::vector<std::ptrdiff_t> win_start;
                for(std::size_t dim = 2; dim < n_dim; ++dim)
                {
                    auto d_2 = dim - 2;
                    win_start.push_back(std::ptrdiff_t(idx_o[dim] * op.stride[d_2]) -
                                        std::ptrdiff_t(op.padding[d_2]));
                }
                const auto group_id = w / (wei_n / op.group);

                shape win_shape{output_shape.type(), win_size};

                // Accumulate in double regardless of element type.
                double acc = 0.0;
                shape_for_each(win_shape, [&](auto idx_win) {
                    auto k           = idx_win[0];
                    const auto in_ch = group_id * wei_c + k;
                    std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
                    idx[1] = in_ch;
                    std::transform(idx_win.begin() + 1,
                                   idx_win.end(),
                                   win_start.begin(),
                                   idx.begin() + 2,
                                   [](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
                    std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
                    idx_wei[0] = w;
                    std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
                    // Skip taps that fall into the padding region: every
                    // spatial coordinate must be >= 0 and element-wise less
                    // than the input lengths (std::equal with std::less
                    // implements the all-less check).
                    if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
                       std::equal(idx.begin(),
                                  idx.end(),
                                  in_lens.begin(),
                                  in_lens.end(),
                                  std::less<std::ptrdiff_t>{}))
                    {
                        acc +=
                            input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
                    }
                });

                output[i] = acc;
            });
        });
        return result;
    }
};

kahmed10's avatar
kahmed10 committed
265
// Generic CPU reference transposed convolution (deconvolution); Op supplies
// stride/padding/dilation/group/kdims. Implemented as scatter-add: each
// input element distributes its contribution over the output window.
// Registered via auto_register_op.
template <class Op>
struct cpu_deconvolution : auto_register_op<cpu_deconvolution<Op>>
{
    cpu_deconvolution() = default;

    cpu_deconvolution(Op pop) : op(std::move(pop)) {}

    Op op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
            using type = typename decltype(output)::value_type;

            // Scatter-add target must start at zero.
            std::fill(output.begin(), output.end(), type{0});

            auto in_lens = input.get_shape().lens();
            auto in_n    = in_lens[0];
            auto in_c    = in_lens[1];

            auto wei   = weights.get_shape().lens();
            auto wei_n = wei[0];
            auto wei_c = wei[1];

            auto out_lens = output_shape.lens();
            auto kdims    = op.kdims();

            // Iteration space: [input channel, input spatial dims..., kernel
            // spatial dims...]; the leading component doubles as the weight's
            // first index (w) below.
            std::vector<std::size_t> win_size{in_c};
            std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
            std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
            shape win_shape{output_shape.type(), win_size};

            // Parallel over (batch, output-channel-within-group); distinct
            // (o, k) pairs write disjoint output slices, so the += below is
            // race-free across threads.
            par_dfor(in_n, wei_c)([&](int o, int k) {

                shape_for_each(win_shape, [&](auto idx_win) {
                    const int w = idx_win[0];

                    auto input_dims_start = idx_win.begin() + 1;
                    auto wei_dims_start   = idx_win.begin() + kdims + 1;

                    // Output position of the window's origin for this input
                    // element (may be negative because of padding).
                    std::vector<std::ptrdiff_t> win_start;
                    for(std::size_t n = 0; n < kdims; ++n)
                    {
                        win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
                                            std::ptrdiff_t(op.padding[n]));
                    }

                    const int group_id = w / (wei_n / op.group);
                    const int in_ch    = group_id * wei_c + k;

                    std::vector<std::ptrdiff_t> idx_out{o, in_ch};

                    for(size_t n = 0; n < kdims; n++)
                    {
                        idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
                    }

                    std::vector<std::ptrdiff_t> idx_wei{w, k};
                    std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));

                    std::vector<std::ptrdiff_t> idx_in{o, w};
                    std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));

                    // Only accumulate when the target lies inside the output
                    // (element-wise 0 <= idx_out < out_lens on spatial dims).
                    if(std::all_of(
                           idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
                       std::equal(idx_out.begin() + 2,
                                  idx_out.end(),
                                  out_lens.begin() + 2,
                                  out_lens.end(),
                                  std::less<std::ptrdiff_t>{}))
                    {
                        output(idx_out.begin(), idx_out.end()) +=
                            input(idx_in.begin(), idx_in.end()) *
                            weights(idx_wei.begin(), idx_wei.end());
                    }
                });

            });
        });
        return result;
    }
};

Scott Thornton's avatar
Scott Thornton committed
358
359
// CPU im2col: unrolls convolution input patches into the columns of a
// matrix, so convolution can be expressed as a GEMM. Assumes 4-D NCHW input
// and batch size 1 (input is always indexed with batch index 0).
struct cpu_im2col
{
    op::im2col op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    static std::string name() { return "cpu::im2col"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto input_shape   = args[0].get_shape();
        auto weights_shape = args[1].get_shape();
        visit_all(result, args[0])([&](auto col, auto input) {
            const std::size_t& height   = input_shape.lens()[2];
            const std::size_t& width    = input_shape.lens()[3];
            const std::size_t& channels = weights_shape.lens()[1];
            const std::size_t& kernel_h = weights_shape.lens()[2];
            const std::size_t& kernel_w = weights_shape.lens()[3];
            const std::size_t& pad_h    = op.padding[0];
            const std::size_t& pad_w    = op.padding[1];
            const std::size_t& stride_h = op.stride[0];
            const std::size_t& stride_w = op.stride[1];

            // (iinput, jinput) below track the CENTER of each kernel window,
            // hence the half-kernel offsets.
            long kdiv2_h = long(kernel_h) / 2;
            long kdiv2_w = long(kernel_w) / 2;
            // calculate output sizes
            const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
            const std::size_t col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
            // account for padding for the starting position of the input pixels
            long iinput = kdiv2_h - long(pad_h);
            // loop over output pixels (ioutput, joutput)
            for(std::size_t ioutput = 0; ioutput < col_height; ioutput++, iinput += stride_h)
            {
                long jinput = kdiv2_w - long(pad_w);
                for(std::size_t joutput = 0; joutput < col_width; joutput++, jinput += stride_w)
                {
                    // compute linear index for output
                    std::size_t ldx = ioutput * col_width + joutput;
                    std::size_t p   = 0;
                    // Copy one (channels x kernel_h x kernel_w) patch into
                    // row ldx; out-of-bounds taps (padding) become 0.
                    dfor(channels,
                         kernel_h,
                         kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
                        auto idx    = iinput + long(koffset) - kdiv2_h;
                        auto jdx    = jinput + long(loffset) - kdiv2_w;
                        col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
                                          ? input(0, c, idx, jdx)
                                          : 0;
                        p++;
                    });
                }
            }
        });
        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_im2col)
Scott Thornton's avatar
Scott Thornton committed
420

Paul's avatar
Paul committed
421
422
423
// Pooling policy used by cpu_pooling: running maximum over the window.
struct max_pool
{
    static std::string name() { return "max"; }

    // Identity element for a max-reduction of type T.
    template <class T>
    static T start()
    {
        return std::numeric_limits<T>::lowest();
    }

    // Fold step: keep the larger of the accumulator and the new element.
    static double apply(double x, double y) { return std::max(x, y); }

    // Max pooling does not normalize by the window size.
    static double final(double x, std::size_t) { return x; }
};

// Pooling policy used by cpu_pooling: arithmetic mean over the window.
struct avg_pool
{
    static std::string name() { return "average"; }

    // Accumulation starts at zero (always in double, regardless of T).
    template <class T>
    static double start()
    {
        return 0.0;
    }

    // Fold step: plain summation.
    static double apply(double x, double y) { return x + y; }

    // Normalize the sum by the element count; an empty window yields 0.
    static double final(double x, std::size_t y)
    {
        if(y == 0)
            return 0.0;
        return x / y;
    }
};

// Generic CPU pooling; Op is a policy struct (max_pool / avg_pool) supplying
// start/apply/final. Registered via auto_register_op.
template <class Op>
struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
{
    cpu_pooling() = default;

    cpu_pooling(op::pooling pop) : op(std::move(pop)) {}

    op::pooling op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::pooling_" + Op::name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            using type   = typename decltype(output)::value_type;
            auto in_s    = input.get_shape();
            auto in_lens = in_s.lens();
            std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());

            // One independent reduction per output element.
            par_for(output_shape.elements(), [&](auto i) {
                auto idx_o = output_shape.multi(i);
                auto n_dim = idx_o.size();
                // Clip the pooling window to the input bounds so padding
                // never contributes elements; win_size is the clipped extent.
                std::vector<std::size_t> win_start;
                std::vector<std::size_t> win_size;
                for(std::size_t dim = 2; dim < n_dim; ++dim)
                {
                    auto d_2  = dim - 2;
                    int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
                                static_cast<int>(op.padding[d_2]);
                    int end = std::min(start + op.lengths[d_2], in_lens[dim]);
                    start   = std::max(start, 0);
                    win_start.push_back(start);
                    win_size.push_back(end - start);
                }

                shape win_shape{output_shape.type(), win_size};
                // Note: pool_size is the CLIPPED window element count, so
                // average pooling divides by the in-bounds count only.
                auto pool_size = win_shape.elements();
                double acc     = Op::template start<type>();
                shape_for_each(win_shape, [&](auto idx_w) {
                    auto idx = idx_o;
                    std::transform(idx_w.begin(),
                                   idx_w.end(),
                                   win_start.begin(),
                                   idx.begin() + 2,
                                   [](auto ii, auto jj) { return ii + jj; });
                    // NOTE(review): `idx < in_lens` is a lexicographic
                    // container comparison, not element-wise; it is harmless
                    // here only because the window was already clipped to the
                    // input bounds above — confirm before relying on it.
                    if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
                       idx < in_lens)
                    {
                        acc = Op::apply(acc, input[in_s.index(idx)]);
                    }
                });

                output[i] = type(Op::final(acc, pool_size));
            });
        });
        return result;
    }
};

521
// Thin adapter that wraps an arbitrary (reference) operation so it runs on
// the CPU target; compute/compute_shape delegate directly to the wrapped op.
struct cpu_op
{
    operation op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::op"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
    {
        // The CPU context is unused; the wrapped op's reference
        // implementation does the work.
        return op.compute(output_shape, args);
    }
    // Print as "cpu::<wrapped op>" for debugging/program dumps.
    friend std::ostream& operator<<(std::ostream& os, const cpu_op& x)
    {
        os << "cpu::" << x.op;
        return os;
    }
};
MIGRAPHX_REGISTER_OP(cpu_op)
Paul's avatar
Paul committed
542

Khalique's avatar
Khalique committed
543
// CPU pad: fills the output with the (clamped) pad value, then copies the
// input into the region offset by the leading per-dimension pad amounts.
struct cpu_pad
{
    op::pad op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::pad"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        assert(output_shape.standard());
        argument result{output_shape};
        // Pre-fill everything with the pad value, clamped to the output
        // element type's representable range.
        result.visit([&](auto output) {
            using type = typename decltype(output)::value_type;
            std::fill(output.begin(), output.end(), pad_clamp<type>(op.value));
        });

        // Copy each input element to its shifted position; op.pads holds the
        // leading (before) pad amount per dimension.
        visit_all(result, args[0])([&](auto output, auto input) {
            shape_for_each(input.get_shape(), [&](const auto& idx) {
                std::vector<std::size_t> new_idx(idx.size());
                std::transform(
                    idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
                        return i + j;
                    });
                output(new_idx.begin(), new_idx.end()) = input(idx.begin(), idx.end());
            });
        });

        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_pad)
579

Paul's avatar
Paul committed
580
581
// CPU GEMM: result = alpha * A * B [+ beta * C]. With 3 inputs, C is
// pre-loaded into the result buffer so migemm's beta term picks it up.
struct cpu_gemm
{
    op::dot op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        // C (if present) must not be a broadcasted shape, since it is copied
        // element-wise into the result buffer below.
        if(inputs.size() == 3)
        {
            auto c_shape = inputs.at(2);
            check_shapes{{c_shape}}.not_broadcasted();
        }
        return op.compute_shape(inputs);
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape as A * B
        if(args.size() == 3)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0.0f)
            {
                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                // Seed result with C so migemm computes alpha*A*B + beta*result.
                visit_all(result, args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }

            migemm(result, args[0], args[1], op.alpha, op.beta);

            return result;
        }

        // 2 input arguments
        migemm(result, args[0], args[1], op.alpha, 0.0f);

        return result;
    }
};
MIGRAPHX_REGISTER_OP(cpu_gemm)
Paul's avatar
Paul committed
631

632
633
634
struct cpu_quant_gemm
{
    op::quant_dot op;
635
636
637
638
639
640
641

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
    std::string name() const { return "cpu::quant_dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(inputs.size() == 3)
        {
            auto c_shape = inputs.at(2);
            check_shapes{{c_shape}}.not_broadcasted();
        }
        return op.compute_shape(inputs);
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape to A * B

        // first, convert the args[0] and args[1] from int8_t to int32_t
        argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
        argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
        arg_0.visit([&](auto output) {
Shucai Xiao's avatar
Shucai Xiao committed
663
664
            args.at(0).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
665
666
667
        });

        arg_1.visit([&](auto output) {
Shucai Xiao's avatar
Shucai Xiao committed
668
669
            args.at(1).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
        });

        if(args.size() == 3)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0)
            {
                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(result, args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }

            migemm(result, arg_0, arg_1, op.alpha, op.beta);

            return result;
        }

        // 2 input arguments
692
        migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
693
694
695
696

        return result;
    }
};
697
MIGRAPHX_REGISTER_OP(cpu_gemm)
698

Khalique's avatar
Khalique committed
699
700
701
702
703
704
struct leaky_relu_op
{
    op::leaky_relu op;
    std::string name() const { return "cpu::leaky_relu"; }
    auto fcn() const
    {
Paul's avatar
Paul committed
705
        auto a = op.alpha;
Khalique's avatar
Khalique committed
706
707
708
709
        return [a](auto x) { return x > 0 ? x : x * a; };
    }
};

Khalique's avatar
Khalique committed
710
711
712
713
714
715
struct elu_op
{
    op::elu op;
    std::string name() const { return "cpu::elu"; }
    auto fcn() const
    {
Paul's avatar
Paul committed
716
        auto a = op.alpha;
Khalique's avatar
Khalique committed
717
718
719
720
        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
    }
};

Paul's avatar
Paul committed
721
template <typename Op>
722
struct cpu_unary : auto_register_op<cpu_unary<Op>>
Paul's avatar
Paul committed
723
{
724
725
726
727
728
729
730
    cpu_unary() = default;

    template <class T>
    cpu_unary(T pop) : op(Op{std::move(pop)})
    {
    }

Paul's avatar
Paul committed
731
    Op op;
732
733
734
735
736
737

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op.op, f);
    }
Paul's avatar
Paul committed
738
    std::string name() const { return op.name(); }
Shucai Xiao's avatar
Shucai Xiao committed
739
    shape compute_shape(const std::vector<shape>& inputs) const
740
    {
Shucai Xiao's avatar
Shucai Xiao committed
741
742
        check_shapes{inputs}.has(1);
        auto s = inputs.at(0);
743
        return {s.type(), s.lens()};
744
745
    }

Paul's avatar
Paul committed
746
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
Paul's avatar
Paul committed
747
748
    {
        argument result{output_shape};
749
750
751
        visit_all(result, args[0])([&](auto output, auto input) {
            assert(input.get_shape().standard());
            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
Paul's avatar
Paul committed
752
        });
753

Paul's avatar
Paul committed
754
755
756
757
        return result;
    }
};

758
template <class Op>
759
struct cpu_softmax : auto_register_op<cpu_softmax<Op>>
Paul's avatar
Paul committed
760
{
761
762
763
764
    cpu_softmax() = default;

    cpu_softmax(Op pop) : op(std::move(pop)) {}

765
    Op op;
Khalique's avatar
Khalique committed
766
767
768
769
770
771
772

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

773
    std::string name() const { return "cpu::" + op.name(); }
Khalique's avatar
Khalique committed
774
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
Paul's avatar
Paul committed
775
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
Paul's avatar
Paul committed
776
777
    {
        argument result{output_shape};
778
779
780
781
        auto batch_lens    = output_shape.lens();
        int64_t tuned_axis = (op.axis < 0) ? op.axis + args[0].get_shape().lens().size() : op.axis;
        std::size_t n_dims = batch_lens[tuned_axis];
        batch_lens[tuned_axis] = 1;
782
783
        shape batch_shape{shape::int32_type, batch_lens};

Paul's avatar
Paul committed
784
785
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
Shucai Xiao's avatar
Shucai Xiao committed
786
787
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
788
789
            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            par_for(batch_shape.elements(), [&](auto i) {
790
                auto idx = batch_shape.multi(i);
Shucai Xiao's avatar
Shucai Xiao committed
791
                for(std::size_t j = 0; j < n_dims; ++j)
792
                {
793
794
                    idx[tuned_axis] = j;
                    batch_max[i]    = std::max(batch_max[i], input(idx.begin(), idx.end()));
795
                }
Khalique's avatar
Khalique committed
796

Shucai Xiao's avatar
Shucai Xiao committed
797
                for(std::size_t j = 0; j < n_dims; ++j)
798
                {
799
                    idx[tuned_axis]   = j;
Shucai Xiao's avatar
Shucai Xiao committed
800
801
                    std::size_t index = output_shape.index(idx);
                    output[index]     = std::exp(input[index] - batch_max[i]);
802
                }
Khalique's avatar
Khalique committed
803

Shucai Xiao's avatar
Shucai Xiao committed
804
                for(std::size_t j = 0; j < n_dims; ++j)
805
                {
806
                    idx[tuned_axis] = j;
807
808
                    batch_sum[i] += output(idx.begin(), idx.end());
                }
Khalique's avatar
Khalique committed
809

Shucai Xiao's avatar
Shucai Xiao committed
810
                for(std::size_t j = 0; j < n_dims; ++j)
811
                {
812
                    idx[tuned_axis] = j;
813
814
                    output(idx.begin(), idx.end()) =
                        op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
815
                }
Shucai Xiao's avatar
Shucai Xiao committed
816
817
818
819
820
821
822
            });
        });

        return result;
    }
};

Shucai Xiao's avatar
Shucai Xiao committed
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
// Extracts the last valid RNN output when sequence lengths vary per batch
// element: args[0] is the full per-timestep output, args[1] the per-batch
// sequence lengths.
struct cpu_rnn_var_sl_last_output
{
    op::rnn_var_sl_last_output op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::rnn_var_sl_last_output"; }

    shape compute_shape(std::vector<shape> inputs) const
    {
        return op.compute_shape(std::move(inputs));
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // shape used to index the input: same as args[0] but with the leading
        // (sequence) dimension collapsed to 1
        auto out_comp_lens = args[0].get_shape().lens();
        out_comp_lens[0]   = 1;
        shape out_comp_s{output_shape.type(), out_comp_lens};

        visit_all(result, args[0])([&](auto output, auto input) {
            args[1].visit([&](auto seq_lens) {
                par_for(output_shape.elements(), [&](auto i) {
                    auto idx = out_comp_s.multi(i);
                    // assumes idx[1] is the direction index and idx[2] the
                    // batch index — TODO confirm against op::rnn_var_sl_last_output
                    auto b   = idx[2];
                    if(op.direction == op::rnn_direction::reverse or idx[1] == 1)
                    {
                        // reverse pass: its "last" output is at time step 0
                        idx[0] = 0;
                    }
                    else
                    {
                        // forward pass: last valid step for this batch element
                        idx[0] = seq_lens[b] - 1;
                    }
                    output[i] = input(idx.begin(), idx.end());
                });
            });
        });

        return result;
    }
};
868
MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output)
Shucai Xiao's avatar
Shucai Xiao committed
869

Paul's avatar
Paul committed
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
struct cpu_apply
{
    program* prog;
    std::unordered_map<std::string, std::function<void(instruction_ref)>> apply_map{};

    template <class T>
    auto simple_op()
    {
        return [this](instruction_ref ins) { apply_simple_op<T>(ins); };
    }

    template <class T, class Op>
    auto extend_op()
    {
        return [this](instruction_ref ins) { apply_extend_op<T, Op>(ins); };
    }

    void init()
    {
Aditya Atluri's avatar
Aditya Atluri committed
889
        apply_map["batch_norm_inference"] =
890
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
891
        apply_map["convolution"] = extend_op<cpu_convolution<op::convolution>, op::convolution>();
kahmed10's avatar
kahmed10 committed
892
893
894
895
        apply_map["deconvolution"] =
            extend_op<cpu_deconvolution<op::deconvolution>, op::deconvolution>();
        apply_map["dot"]       = extend_op<cpu_gemm, op::dot>();
        apply_map["quant_dot"] = extend_op<cpu_quant_gemm, op::quant_dot>();
896
897
898
899
900
901
902
903
904
        apply_map["quant_convolution"] =
            extend_op<cpu_convolution<op::quant_convolution>, op::quant_convolution>();
        apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
        apply_map["im2col"]     = extend_op<cpu_im2col, op::im2col>();
        apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
        apply_map["logsoftmax"] = extend_op<cpu_softmax<op::logsoftmax>, op::logsoftmax>();
        apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
        apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
        apply_map["softmax"]    = extend_op<cpu_softmax<op::softmax>, op::softmax>();
Shucai Xiao's avatar
Shucai Xiao committed
905
906
        apply_map["rnn_var_sl_last_output"] =
            extend_op<cpu_rnn_var_sl_last_output, op::rnn_var_sl_last_output>();
Paul's avatar
Paul committed
907
908
909
910
911
912
913
    }

    void apply()
    {
        init();
        for(auto it : iterator_for(*prog))
        {
Khalique's avatar
Khalique committed
914
            if(it->name() == "pooling")
Paul's avatar
Paul committed
915
916
917
            {
                apply_pooling(it);
            }
Paul's avatar
Paul committed
918
            else if(apply_map.count(it->name()) > 0)
Paul's avatar
Paul committed
919
            {
Paul's avatar
Paul committed
920
                apply_map.at(it->name())(it);
Paul's avatar
Paul committed
921
            }
Paul's avatar
Paul committed
922
            else if(is_context_free(it->get_operator()))
923
924
925
            {
                apply_cpu_op(it);
            }
Paul's avatar
Paul committed
926
927
928
        }
    }

929
930
931
932
933
    void apply_cpu_op(instruction_ref ins)
    {
        prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
    }

Paul's avatar
Paul committed
934
935
936
    template <class T>
    void apply_simple_op(instruction_ref ins)
    {
Paul's avatar
Paul committed
937
        prog->replace_instruction(ins, T{}, ins->inputs());
Paul's avatar
Paul committed
938
939
940
941
942
    }

    template <class T, class Op>
    void apply_extend_op(instruction_ref ins)
    {
943
        auto&& op = any_cast<Op>(ins->get_operator());
Paul's avatar
Paul committed
944
        prog->replace_instruction(ins, T{op}, ins->inputs());
Paul's avatar
Paul committed
945
946
947
948
    }

    void apply_pooling(instruction_ref ins)
    {
949
        auto&& op = any_cast<op::pooling>(ins->get_operator());
Paul's avatar
Paul committed
950
        if(op.mode == "max")
Paul's avatar
Paul committed
951
            prog->replace_instruction(ins, cpu_pooling<max_pool>{op}, ins->inputs());
Paul's avatar
Paul committed
952
        else if(op.mode == "average")
Paul's avatar
Paul committed
953
            prog->replace_instruction(ins, cpu_pooling<avg_pool>{op}, ins->inputs());
Paul's avatar
Paul committed
954
955
956
    }
};

Shucai Xiao's avatar
Shucai Xiao committed
957
void lowering::apply(program& p) const { cpu_apply{&p}.apply(); }
Paul's avatar
Paul committed
958
959

} // namespace cpu
Paul's avatar
Paul committed
960
} // namespace MIGRAPHX_INLINE_NS
Paul's avatar
Paul committed
961
} // namespace migraphx