Commit 522da00b authored by Shucai Xiao, committed by mvermeulen

Cpu conv softmax simplify impl (#397)

* simplify cpu implementation of the convolution, softmax, and logsoftmax

* clang format

* fix cppcheck error

* improve code coverage
parent 1f4f6d44
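
The change collapses the per-operator CPU kernels into wrappers templated on the operator: cpu_convolution<Op> covers op::convolution and op::quant_convolution, and cpu_softmax<Op> covers op::softmax and op::logsoftmax, with the runtime name built as "cpu::" + op.name(). Below is a minimal standalone sketch of that pattern; the names softmax_op, logsoftmax_op, and cpu_softmax_like are illustrative stand-ins, not the MIGraphX types (the real hooks are the op::softmax::output() and op::logsoftmax::output() lambdas added in the first two hunks of this diff).

```cpp
// Illustrative sketch only: a stripped-down model of the wrapper pattern used
// in this change, with made-up operator structs standing in for the real ops.
#include <cmath>
#include <iostream>
#include <string>

struct softmax_op
{
    std::string name() const { return "softmax"; }
    // final step applied to an exp-shifted value x and its per-batch sum y
    auto output() const
    {
        return [](double x, double y) { return x / y; };
    }
};

struct logsoftmax_op
{
    std::string name() const { return "logsoftmax"; }
    auto output() const
    {
        return [](double x, double y) { return std::log(x / y); };
    }
};

// One CPU wrapper serves both operators; only the final lambda differs.
template <class Op>
struct cpu_softmax_like
{
    Op op;
    std::string name() const { return "cpu::" + op.name(); }
    double finalize(double exp_shifted, double batch_sum) const
    {
        return op.output()(exp_shifted, batch_sum);
    }
};

int main()
{
    cpu_softmax_like<softmax_op> s{};
    cpu_softmax_like<logsoftmax_op> ls{};
    std::cout << s.name() << " " << s.finalize(1.0, 4.0) << "\n";   // cpu::softmax 0.25
    std::cout << ls.name() << " " << ls.finalize(1.0, 4.0) << "\n"; // cpu::logsoftmax log(0.25)
}
```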
@@ -30,6 +30,11 @@ struct logsoftmax
         }
         return inputs.at(0);
     }
+
+    auto output() const
+    {
+        return [=](auto x, auto y) { return std::log(x / y); };
+    }
 };
 } // namespace op
...
@@ -30,6 +30,11 @@ struct softmax
         }
         return inputs.at(0);
     }
+
+    auto output() const
+    {
+        return [=](auto x, auto y) { return x / y; };
+    }
 };
 } // namespace op
...
@@ -166,61 +166,10 @@ struct cpu_lrn
     }
 };
 
+template <class Op>
 struct cpu_convolution
 {
-    op::convolution op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "cpu::convolution"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            auto in   = input.get_shape().lens();
-            auto in_h = in[2];
-            auto in_w = in[3];
-
-            auto wei   = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-            auto wei_h = wei[2];
-            auto wei_w = wei[3];
-
-            par_dfor(output_shape.lens()[0],
-                     output_shape.lens()[1],
-                     output_shape.lens()[2],
-                     output_shape.lens()[3])(
-                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const auto start_x  = i * op.stride[0] - op.padding[0];
-                    const auto start_y  = j * op.stride[1] - op.padding[1];
-                    const auto group_id = w / (wei_n / op.group);
-
-                    double acc = 0;
-                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const auto in_x  = start_x + x;
-                        const auto in_y  = start_y + y;
-                        const auto in_ch = group_id * wei_c + k;
-                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
-                        {
-                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
-                        }
-                    });
-                    output(o, w, i, j) = acc;
-                });
-        });
-        return result;
-    }
-};
-
-struct cpu_quant_convolution
-{
-    op::quant_convolution op;
+    Op op;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -228,47 +177,45 @@ struct cpu_quant_convolution
         return migraphx::reflect(self.op, f);
     }
 
-    std::string name() const { return "cpu::quant_convolution"; }
+    std::string name() const { return "cpu::" + op.name(); }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, shape output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        auto output = result.get<int32_t>();
-        visit_all(args[0], args[1])([&](auto input, auto weights) {
-            auto in   = input.get_shape().lens();
-            auto in_h = in[2];
-            auto in_w = in[3];
-
-            auto wei   = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-            auto wei_h = wei[2];
-            auto wei_w = wei[3];
-
-            par_dfor(output_shape.lens()[0],
-                     output_shape.lens()[1],
-                     output_shape.lens()[2],
-                     output_shape.lens()[3])(
-                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const auto start_x  = i * op.stride[0] - op.padding[0];
-                    const auto start_y  = j * op.stride[1] - op.padding[1];
-                    const auto group_id = w / (wei_n / op.group);
-
-                    int32_t acc = 0;
-                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const auto in_x  = start_x + x;
-                        const auto in_y  = start_y + y;
-                        const auto in_ch = group_id * wei_c + k;
-                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
-                        {
-                            acc += static_cast<int32_t>(input(o, in_ch, in_x, in_y)) *
-                                   weights(w, k, x, y);
-                        }
-                    });
-                    output(o, w, i, j) = acc;
-                });
-        });
+        result.visit([&](auto output) {
+            using type = typename decltype(output)::value_type;
+            visit_all(args[0], args[1])([&](auto input, auto weights) {
+                auto in   = input.get_shape().lens();
+                auto in_h = in[2];
+                auto in_w = in[3];
+
+                auto wei   = weights.get_shape().lens();
+                auto wei_n = wei[0];
+                auto wei_c = wei[1];
+                auto wei_h = wei[2];
+                auto wei_w = wei[3];
+
+                par_dfor(output_shape.lens()[0],
+                         output_shape.lens()[1],
+                         output_shape.lens()[2],
+                         output_shape.lens()[3])(
+                    [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
+                        const auto start_x  = i * op.stride[0] - op.padding[0];
+                        const auto start_y  = j * op.stride[1] - op.padding[1];
+                        const auto group_id = w / (wei_n / op.group);
+
+                        type acc = type{0};
+                        dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
+                            const auto in_x  = start_x + x;
+                            const auto in_y  = start_y + y;
+                            const auto in_ch = group_id * wei_c + k;
+                            if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
+                                acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
+                        });
+                        output(o, w, i, j) = acc;
+                    });
+            });
+        });
         return result;
     }
 };
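
The merged cpu_convolution<Op> above deduces its accumulator from the output tensor (result.visit plus decltype(output)::value_type), so the same loop accumulates in int32_t for quantized convolution and in the output's float type for regular convolution (the old float path accumulated in double). A rough sketch of the deduction idea, using hypothetical helper names rather than the MIGraphX visitors:

```cpp
// Sketch only: the accumulator type follows the output element type, so one
// templated loop serves both a float convolution and an int8 -> int32 one.
#include <cstdint>
#include <iostream>
#include <vector>

template <class Out, class In, class Wei>
void accumulate_one(Out& out, const std::vector<In>& input, const std::vector<Wei>& weights)
{
    using type = Out; // stands in for `typename decltype(output)::value_type`
    type acc   = type{0};
    for(std::size_t i = 0; i < input.size(); ++i)
        acc += input[i] * weights[i]; // int8 * int8 promotes to int, accumulated in int32_t
    out = acc;
}

int main()
{
    float f_out = 0.0f;
    accumulate_one(f_out, std::vector<float>{0.5f, 1.5f}, std::vector<float>{2.0f, 2.0f});

    int32_t q_out = 0;
    accumulate_one(q_out, std::vector<int8_t>{3, 4}, std::vector<int8_t>{5, 6});

    std::cout << f_out << " " << q_out << "\n"; // 4 39
}
```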
@@ -620,41 +567,25 @@ struct cpu_unary
     {
         check_shapes{inputs}.has(1);
         auto s = inputs.at(0);
-        if(s.packed())
-        {
-            return s;
-        }
-        else
-        {
-            return {s.type(), s.lens()};
-        }
+        return {s.type(), s.lens()};
     }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        result.visit([&](auto output) {
-            args[0].visit([&](auto input) {
-                if(input.get_shape().standard())
-                {
-                    std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-                }
-                else
-                {
-                    shape_for_each(output.get_shape(), [&](const auto& idx) {
-                        output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
-                    });
-                }
-            });
+        visit_all(result, args[0])([&](auto output, auto input) {
+            assert(input.get_shape().standard());
+            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
         });
         return result;
     }
 };
 
+template <class Op>
 struct cpu_softmax
 {
-    op::softmax op;
+    Op op;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -662,7 +593,7 @@ struct cpu_softmax
         return migraphx::reflect(self.op, f);
     }
 
-    std::string name() const { return "cpu::softmax"; }
+    std::string name() const { return "cpu::" + op.name(); }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
@@ -701,70 +632,8 @@ struct cpu_softmax
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
                     idx[op.axis] = j;
-                    output(idx.begin(), idx.end()) /= batch_sum[i];
-                }
-            });
-        });
-
-        return result;
-    }
-};
-
-struct cpu_logsoftmax
-{
-    op::logsoftmax op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "cpu::logsoftmax"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        auto batch_lens     = output_shape.lens();
-        std::size_t n_dims  = batch_lens[op.axis];
-        batch_lens[op.axis] = 1;
-        shape batch_shape{shape::int32_type, batch_lens};
-
-        // use a parallel implementation to acheive better performance
-        // one thread for one batch
-        visit_all(result, args[0])([&](auto output, auto input) {
-            using value_type = typename decltype(input)::value_type;
-            std::vector<value_type> batch_max(batch_shape.elements(),
-                                              std::numeric_limits<value_type>::lowest());
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            par_for(batch_shape.elements(), [&](auto i) {
-                auto idx = batch_shape.multi(i);
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    std::size_t index = output_shape.index(idx);
-                    output[index]     = input[index] - batch_max[i];
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
-                }
-                batch_sum[i] = std::log(batch_sum[i]);
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[op.axis] = j;
-                    output(idx.begin(), idx.end()) -= batch_sum[i];
-                }
+                    output(idx.begin(), idx.end()) =
+                        op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
+                }
             });
         });
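
The unified cpu_softmax<Op> keeps the shared steps (subtract the per-batch max, exponentiate, accumulate the batch sum) and applies op.output() only at the end: x / y for softmax and std::log(x / y) for logsoftmax. A small standalone check of the identity this relies on, log(exp(x_i - m) / s) = (x_i - m) - log(s), which is exactly what the removed cpu_logsoftmax computed explicitly (this snippet is not part of the PR's test suite):

```cpp
// Numeric sanity check of the softmax/logsoftmax unification.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0};
    double m = *std::max_element(x.begin(), x.end());

    // shared pipeline: e_i = exp(x_i - m), s = sum of e_i
    std::vector<double> e(x.size());
    double s = 0;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        e[i] = std::exp(x[i] - m);
        s += e[i];
    }

    for(std::size_t i = 0; i < x.size(); ++i)
    {
        double softmax    = e[i] / s;           // op::softmax::output()
        double logsoftmax = std::log(e[i] / s); // op::logsoftmax::output()
        // matches the classic (x_i - max) - log(sum) formulation
        assert(std::abs(logsoftmax - ((x[i] - m) - std::log(s))) < 1e-12);
        assert(std::abs(std::exp(logsoftmax) - softmax) < 1e-12);
    }
    return 0;
}
```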
@@ -794,17 +663,18 @@ struct cpu_apply
     {
         apply_map["batch_norm_inference"] =
             extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
+        apply_map["convolution"] = extend_op<cpu_convolution<op::convolution>, op::convolution>();
         apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
         apply_map["quant_dot"]   = extend_op<cpu_quant_gemm, op::quant_dot>();
-        apply_map["quant_convolution"] = extend_op<cpu_quant_convolution, op::quant_convolution>();
+        apply_map["quant_convolution"] =
+            extend_op<cpu_convolution<op::quant_convolution>, op::quant_convolution>();
         apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
         apply_map["im2col"]     = extend_op<cpu_im2col, op::im2col>();
         apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["logsoftmax"] = extend_op<cpu_softmax<op::logsoftmax>, op::logsoftmax>();
         apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
         apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
-        apply_map["softmax"]    = extend_op<cpu_softmax, op::softmax>();
+        apply_map["softmax"]    = extend_op<cpu_softmax<op::softmax>, op::softmax>();
     }
 
     void apply()
...
@@ -24,26 +24,6 @@ namespace gpu {
 
 struct context;
 
-struct miopen_softmax
-{
-    op::softmax op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "miopen::softmax"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
 struct hip_softmax
 {
     op::softmax op;
...
@@ -6,31 +6,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-shape miopen_softmax::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).standard();
-    return op.compute_shape({inputs.at(0)});
-}
-
-argument miopen_softmax::compute(context& ctx,
-                                 const shape& output_shape,
-                                 const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenSoftmaxForward(ctx.get_stream().get_miopen(),
-                         &alpha,
-                         x_desc.get(),
-                         args[0].implicit(),
-                         &beta,
-                         y_desc.get(),
-                         args[1].implicit());
-
-    return args[1];
-}
-
 shape hip_softmax::compute_shape(const std::vector<shape>& inputs) const
 {
     check_shapes{inputs, *this}.has(2).standard();
...
@@ -1329,6 +1329,33 @@ TEST_CASE(quant_dot_3args_general)
         EXPECT(migraphx::verify_range(m, gold));
     }
 
+    {
+        migraphx::program p;
+        migraphx::shape m1_shape{migraphx::shape::int8_type, {3, 4}};
+        migraphx::shape m2_shape{migraphx::shape::int8_type, {4, 5}};
+        migraphx::shape m3_shape{migraphx::shape::int32_type, {3, 5}};
+        std::vector<int8_t> data1(3 * 4);
+        std::vector<int8_t> data2(4 * 5);
+        std::vector<int> data3(3 * 5);
+        std::iota(data1.begin(), data1.end(), 0);
+        std::iota(data2.begin(), data2.end(), 0);
+        std::iota(data3.begin(), data3.end(), 0);
+
+        auto l1 = p.add_literal(migraphx::literal{m1_shape, data1});
+        auto l2 = p.add_literal(migraphx::literal{m2_shape, data2});
+        auto l3 = p.add_literal(migraphx::literal{m3_shape, data3});
+        p.add_instruction(migraphx::op::quant_dot{1, 0}, l1, l2, l3);
+        std::vector<int> gold = {
+            70, 76, 82, 88, 94, 190, 212, 234, 256, 278, 310, 348, 386, 424, 462};
+        p.compile(migraphx::cpu::target{});
+        auto result = p.eval({});
+        std::vector<float> m;
+        result.visit([&](auto output) { m.assign(output.begin(), output.end()); });
+        EXPECT(migraphx::verify_range(m, gold));
+    }
+
     {
         migraphx::program p;
         migraphx::shape m1_shape{migraphx::shape::int8_type, {8, 2}};
...