merge develop branch changes

be5f3539 · Shucai Xiao · 7e3bdc34 · ebfe9735 · be5f3539 · be5f3539
Commit be5f3539 authored Jul 09, 2019 by Shucai Xiao
20 changed files
--- a/src/targets/gpu/device/softmax.cpp
+++ b/src/targets/gpu/device/softmax.cpp
@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -12,69 +13,44 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream,
-                 const migraphx::shape& output_shape,
-                 std::vector<migraphx::argument> args,
-                 int axis)
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis) // writes softmax of arg along axis into result
 {
-    auto lens        = output_shape.lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    migraphx::shape batch_shape{shape::int32_type, batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-
-            // each thread is for one item in the batch
-            gs_launch(stream, batch_shape.elements())([=](auto i) {
-                auto batch_idx = desc_batch.multi(i);
-                auto data_idx  = batch_idx;
-                // get max
-                auto batch_max = input_ptr[desc_data.linear(batch_idx)];
-                for(std::size_t j = 1; j < n_dims; ++j)
-                {
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis]; // extent of the axis being reduced
+    batch_lens[axis]           = 1; // collapse the axis: one entry per reduction
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        const std::size_t max_block_size = 256; // upper bound handed to compute_block_size and block_reduce
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size); // index i / block_size picks the batch entry
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest(); // starting value for the max reduction below
+
+            auto batch_max = block_reduce<max_block_size>( // max of input over the axis
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
                     data_idx[axis] = j;
-                    batch_max      = std::max(to_hip_type(batch_max),
-                                         to_hip_type(input_ptr[desc_data.linear(data_idx)]));
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = input_ptr[idx] - batch_max;
-                }
+                    return input[data_idx];
+                });

-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = exp(to_hip_type(output_ptr[idx]));
-                }
-
-                auto batch_sum = output_ptr[desc_data.linear(batch_idx)];
-                for(std::size_t j = 1; j < n_dims; ++j)
-                {
+            auto batch_sum = // sum of exp(input - batch_max) over the axis
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
                     data_idx[axis] = j;
-                    batch_sum += output_ptr[desc_data.linear(data_idx)];
-                }
-
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = output_ptr[idx] / batch_sum;
-                }
+                    auto val       = input[data_idx] - batch_max; // subtract the max before exp (presumably for numerical stability)
+                    return ::exp(to_hip_type(val));
+                });
+
+            idx.local_stride(batch_item_num, [&](auto j) { // final pass over the axis: normalise each element
+                data_idx[axis]   = j;
+                auto val         = input[data_idx] - batch_max;
+                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum; // exp(x - max) / sum
            });
        });
    });
-
-    return args.back();
 }

 } // namespace device

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
--- a/src/targets/gpu/gather.cpp
+++ b/src/targets/gpu/gather.cpp
@@ -12,11 +12,9 @@ shape hip_gather::compute_shape(std::vector<shape> inputs) const
    return op.compute_shape(inputs);
 }

-argument hip_gather::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
+argument hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::gather(ctx.get_stream().get(), output_shape, args, op.axis);
+    return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis); // forwards the last argument, then args[0] and args[1]; NOTE(review): presumably result, data, indices — confirm against device::gather
 }

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmax.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/argmax.hpp>
+#include <migraphx/gpu/device/argmax.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+struct hip_argmax // operator in the gpu namespace that wraps op::argmax
+{
+    op::argmax op; // the wrapped operator; reflect() below forwards to it
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f) // exposes the wrapped op's reflected fields
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::argmax"; } // name reported by this operator
+    shape compute_shape(const std::vector<shape>& inputs) const; // declared only; defined outside this header
+    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const; // declared only; the shape parameter is unnamed
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const // returns the index of the last entry in shapes
+    {
+        return shapes.size() - 1; // NOTE(review): presumably the last argument is the output buffer — confirm
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/argmin.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmax.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/argmin.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/erf.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/erf.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/gather.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp
--- a/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/softmax.hpp
--- a/src/targets/gpu/include/migraphx/gpu/erf.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/erf.hpp
--- a/src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp
--- a/src/targets/gpu/include/migraphx/gpu/softmax.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/softmax.hpp
@@ -34,7 +34,7 @@ struct miopen_softmax
        return migraphx::reflect(self.op, f);
    }

-    std::string name() const { return "gpu::softmax"; }
+    std::string name() const { return "miopen::softmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;

--- a/src/targets/gpu/logsoftmax.cpp
+++ b/src/targets/gpu/logsoftmax.cpp
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
--- a/src/targets/gpu/reduce_sum.cpp
+++ b/src/targets/gpu/reduce_sum.cpp
--- a/src/targets/gpu/softmax.cpp
+++ b/src/targets/gpu/softmax.cpp