Commit b8090620 authored by Shucai Xiao's avatar Shucai Xiao

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into rnn_optimization

parents c2db3b96 3540f1b9
#include <migraphx/gpu/device/clip.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const float max,
const float min)
{
// clamp each element of arg1 into [min, max]
nary(stream, result, arg1)(
[max, min](auto x) { return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
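// Reference sketch (standalone and hypothetical, not part of this commit): the kernel
// above clamps each element to [min, max], equivalent to this host-side loop.
#include <algorithm>
#include <cstddef>
#include <vector>

inline std::vector<float> clip_reference(const std::vector<float>& x, float max, float min)
{
    std::vector<float> y(x.size());
    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = std::min(std::max(min, x[i]), max); // same as std::clamp(x[i], min, max)
    return y;
}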
#include <migraphx/gpu/device/convert.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void convert(hipStream_t stream, const argument& result, const argument& arg)
{
// visit both tensors so the copy lambda sees the concrete input and output
// element types; the assignment below then performs the conversion
result.visit([&](auto output) {
arg.visit([&](auto input) {
const auto* input_ptr = device_cast(input.data());
auto* output_ptr = device_cast(output.data());
gs_launch(stream,
result.get_shape().elements())([=](auto i) { output_ptr[i] = input_ptr[i]; });
});
});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
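// Reference sketch (standalone and hypothetical): convert copies elementwise and lets
// plain assignment perform the type conversion, exactly as the visited lambda above does.
#include <cstddef>
#include <vector>

template <class Out, class In>
std::vector<Out> convert_reference(const std::vector<In>& in)
{
    std::vector<Out> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
        out[i] = static_cast<Out>(in[i]); // output_ptr[i] = input_ptr[i] relies on the same conversion
    return out;
}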
......@@ -17,47 +17,56 @@ argument logsoftmax(hipStream_t stream,
                    int axis)
 {
-    auto lens = output_shape.lens();
-    std::size_t batch_size = std::accumulate(
-        lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<std::size_t>());
-    std::size_t n_dims = std::accumulate(
-        lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
-    migraphx::shape comp_shape{output_shape.type(), {batch_size, n_dims}};
+    auto lens = output_shape.lens();
+    auto num_in_batch = lens[axis];
+    auto batch_lens = lens;
+    batch_lens[axis] = 1;
+    migraphx::shape batch_shape{output_shape.type(), batch_lens};
     visit_all(args.back(), args.front())([&](auto output, auto input) {
         const auto* input_ptr = device_cast(input.data());
         auto* output_ptr = device_cast(output.data());
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_size)([=](auto i) {
-            std::size_t row_start = i * n_dims;
-            // get max
-            auto batch_max = input_ptr[row_start];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[ind]));
-            }
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                output_ptr[ind] = input_ptr[ind] - batch_max;
-            }
-            auto batch_sum = ::exp(to_hip_type(output_ptr[row_start]));
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                batch_sum += ::exp(to_hip_type(output_ptr[ind]));
-            }
-            batch_sum = ::log(to_hip_type(batch_sum));
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                output_ptr[ind] -= batch_sum;
-            }
-        });
+        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
+            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
+            hip_tensor_descriptor<n_dim> desc_data(output_shape);
+            // each thread is for one item in the batch
+            gs_launch(stream, batch_shape.elements())([=](auto i) {
+                auto batch_idx = desc_batch.multi(i);
+                auto data_idx = batch_idx;
+                // get max
+                auto batch_max = input_ptr[desc_data.linear(batch_idx)];
+                for(std::size_t j = 1; j < num_in_batch; ++j)
+                {
+                    data_idx[axis] = j;
+                    size_t idx = desc_data.linear(data_idx);
+                    batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[idx]));
+                }
+                for(std::size_t j = 0; j < num_in_batch; ++j)
+                {
+                    data_idx[axis] = j;
+                    size_t idx = desc_data.linear(data_idx);
+                    output_ptr[idx] = input_ptr[idx] - batch_max;
+                }
+                auto batch_sum = ::exp(to_hip_type(output_ptr[desc_data.linear(batch_idx)]));
+                for(std::size_t j = 1; j < num_in_batch; ++j)
+                {
+                    data_idx[axis] = j;
+                    size_t idx = desc_data.linear(data_idx);
+                    batch_sum += ::exp(to_hip_type(output_ptr[idx]));
+                }
+                batch_sum = ::log(to_hip_type(batch_sum));
+                for(std::size_t j = 0; j < num_in_batch; ++j)
+                {
+                    data_idx[axis] = j;
+                    size_t idx = desc_data.linear(data_idx);
+                    output_ptr[idx] -= batch_sum;
+                }
+            });
+        });
     });
......
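// Reference sketch (standalone and hypothetical) of what the new kernel computes for
// one batch item: y_j = x_j - max(x) - log(sum_k exp(x_k - max(x))), the numerically
// stable form of log(softmax(x)_j).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

inline std::vector<float> logsoftmax_reference(const std::vector<float>& x)
{
    float m = *std::max_element(x.begin(), x.end()); // "get max" pass
    float sum = 0.0f;
    for(float v : x)
        sum += std::exp(v - m); // sum of shifted exponentials
    float log_sum = std::log(sum);
    std::vector<float> y(x.size());
    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] - m - log_sum; // subtract max, then the log-sum
    return y;
}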
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/softmax.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument softmax(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
int axis)
{
auto lens = output_shape.lens();
auto batch_lens = lens;
size_t n_dims = lens[axis];
batch_lens[axis] = 1;
// batch_shape is only used for element counts and index arithmetic,
// so its element type is irrelevant
migraphx::shape batch_shape{shape::int32_type, batch_lens};
visit_all(args.back(), args.front())([&](auto output, auto input) {
const auto* input_ptr = device_cast(input.data());
auto* output_ptr = device_cast(output.data());
visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
hip_tensor_descriptor<n_dim> desc_data(output_shape);
// each thread is for one item in the batch
gs_launch(stream, batch_shape.elements())([=](auto i) {
auto batch_idx = desc_batch.multi(i);
auto data_idx = batch_idx;
// get max (subtracted before exp below for numerical stability)
auto batch_max = input_ptr[desc_data.linear(batch_idx)];
for(std::size_t j = 1; j < n_dims; ++j)
{
data_idx[axis] = j;
batch_max = std::max(to_hip_type(batch_max),
to_hip_type(input_ptr[desc_data.linear(data_idx)]));
}
for(std::size_t j = 0; j < n_dims; ++j)
{
data_idx[axis] = j;
auto idx = desc_data.linear(data_idx);
output_ptr[idx] = input_ptr[idx] - batch_max;
}
for(std::size_t j = 0; j < n_dims; ++j)
{
data_idx[axis] = j;
auto idx = desc_data.linear(data_idx);
output_ptr[idx] = exp(to_hip_type(output_ptr[idx]));
}
auto batch_sum = output_ptr[desc_data.linear(batch_idx)];
for(std::size_t j = 1; j < n_dims; ++j)
{
data_idx[axis] = j;
batch_sum += output_ptr[desc_data.linear(data_idx)];
}
for(std::size_t j = 0; j < n_dims; ++j)
{
data_idx[axis] = j;
auto idx = desc_data.linear(data_idx);
output_ptr[idx] = output_ptr[idx] / batch_sum;
}
});
});
});
return args.back();
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
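// Simplified illustration (hypothetical, assuming row-major strides; not the real
// hip_tensor_descriptor) of the indexing both kernels rely on: multi(i) expands a flat
// thread index into per-dimension coordinates with the reduced axis fixed at 0, and
// linear(idx) maps coordinates back to a flat offset while the kernel sweeps data_idx[axis].
#include <array>
#include <cstddef>

template <std::size_t N>
struct tensor_descriptor_sketch
{
    std::array<std::size_t, N> lens{};
    std::array<std::size_t, N> strides{};

    explicit tensor_descriptor_sketch(const std::array<std::size_t, N>& l) : lens(l)
    {
        std::size_t s = 1;
        for(std::size_t d = N; d-- > 0;) // row-major: last dimension is contiguous
        {
            strides[d] = s;
            s *= lens[d];
        }
    }

    std::array<std::size_t, N> multi(std::size_t i) const
    {
        std::array<std::size_t, N> idx{};
        for(std::size_t d = 0; d < N; ++d)
        {
            idx[d] = i / strides[d];
            i %= strides[d];
        }
        return idx;
    }

    std::size_t linear(const std::array<std::size_t, N>& idx) const
    {
        std::size_t i = 0;
        for(std::size_t d = 0; d < N; ++d)
            i += idx[d] * strides[d];
        return i;
    }
};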
......@@ -13,6 +13,13 @@ struct context;
struct miopen_abs
{
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ad.get(), f);
}
std::string name() const { return "gpu::abs"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
......@@ -13,6 +13,13 @@ struct context;
struct miopen_batch_norm_inference
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_clip
{
op::clip op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::clip"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
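// Note the output-alias convention visible here and in the other gpu ops: the last
// input argument is the preallocated output buffer, so returning shapes.size() - 1
// from output_alias marks the result as aliasing that final input. The same
// convention is why hip_convert::compute_shape below pops the trailing shape
// before validating.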
......@@ -14,6 +14,12 @@ struct hip_concat
{
op::concat op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::concat"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
......
......@@ -13,6 +13,13 @@ struct context;
struct miopen_contiguous
{
op::contiguous op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument compute(context&, shape output_shape, const std::vector<argument>& args) const;
......
#ifndef MIGRAPHX_GUARD_RTGLIB_CONVERT_HPP
#define MIGRAPHX_GUARD_RTGLIB_CONVERT_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/convert.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_convert : unary_device<hip_convert, device::convert>
{
op::convert op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
hip_convert(op::convert oper) : op(oper) {}
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back(); // drop the trailing preallocated output buffer
check_shapes{inputs}.packed();
return op.compute_shape(inputs);
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CLIP_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CLIP_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream, const argument& result, const argument& arg1, float max, float min);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CONVERT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CONVERT_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void convert(hipStream_t stream, const argument& result, const argument& arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SOFTMAX_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SOFTMAX_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument softmax(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
int axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -13,6 +13,13 @@ struct context;
struct miopen_elu
{
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ad.get(), f);
}
std::string name() const { return "gpu::elu"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
......@@ -14,6 +14,13 @@ struct context;
struct hip_gather
{
op::gather op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::gather"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
......
......@@ -13,6 +13,13 @@ struct context;
struct miopen_gemm
{
op::dot op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::gemm"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
......@@ -28,6 +28,13 @@ struct hip_allocate
{
shape s;
std::string tag{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.s, "shape"), f(self.tag, "tag"));
}
std::string name() const { return "hip::allocate"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
......@@ -43,6 +50,13 @@ struct hip_allocate
struct hip_sync
{
std::string tag{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.tag, "tag"));
}
std::string name() const { return "hip::sync"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
......
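// The reflect functions added throughout this commit follow one visitor pattern:
// hand every member, together with a field name, to a caller-supplied callable, so a
// single definition can drive printing, serialization, hashing, or equality checks.
// Minimal standalone sketch (simplified and hypothetical, not MIGraphX's machinery):
#include <iostream>
#include <string>
#include <tuple>
#include <utility>

template <class... Ts>
auto pack(Ts&&... xs)
{
    return std::make_tuple(std::forward<Ts>(xs)...);
}

struct example_allocate
{
    std::string shape_repr; // stand-in for the real shape member
    std::string tag{};

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.shape_repr, "shape"), f(self.tag, "tag"));
    }
};

int main()
{
    example_allocate op{"float_type, {2, 3}", "scratch"};
    // Visit every reflected field and print "name: value".
    example_allocate::reflect(op, [](auto& member, const char* name) {
        std::cout << name << ": " << member << '\n';
        return 0; // pack needs a value to collect
    });
}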
......@@ -13,6 +13,13 @@ struct context;
struct miopen_leaky_relu
{
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ad.get(), f);
}
std::string name() const { return "gpu::leaky_relu"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
......@@ -25,6 +25,13 @@ namespace gpu {
struct hip_logsoftmax
{
op::logsoftmax op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::logsoftmax"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......
......@@ -13,6 +13,13 @@ struct context;
struct miopen_lrn
{
shared<lrn_descriptor> ldesc;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ldesc.get(), f);
}
std::string name() const { return "gpu::lrn"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
......