gaoqiong / MIGraphX · Commits · f06f6aa3

Commit f06f6aa3, authored Jul 09, 2019 by Shucai Xiao

    merge changes from develop branch

Parents: 80a35596, ebfe9735

Changes: 82 files in this commit; showing 20 changed files with 639 additions and 169 deletions (+639 −169), page 1 of 5.
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp (+170 −0)
src/targets/gpu/device/logsoftmax.cpp (+32 −52)
src/targets/gpu/device/pad.cpp (+15 −22)
src/targets/gpu/device/reduce_sum.cpp (+17 −0)
src/targets/gpu/device/softmax.cpp (+32 −56)
src/targets/gpu/fuse_ops.cpp (+39 −19)
src/targets/gpu/gather.cpp (+2 −4)
src/targets/gpu/include/migraphx/gpu/argmax.hpp (+37 −0)
src/targets/gpu/include/migraphx/gpu/argmin.hpp (+37 −0)
src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp (+114 −0)
src/targets/gpu/include/migraphx/gpu/device/argmax.hpp (+20 −0)
src/targets/gpu/include/migraphx/gpu/device/argmin.hpp (+20 −0)
src/targets/gpu/include/migraphx/gpu/device/erf.hpp (+20 −0)
src/targets/gpu/include/migraphx/gpu/device/gather.hpp (+1 −4)
src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp (+1 −4)
src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp (+20 −0)
src/targets/gpu/include/migraphx/gpu/device/softmax.hpp (+1 −4)
src/targets/gpu/include/migraphx/gpu/erf.hpp (+19 −0)
src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp (+38 −0)
src/targets/gpu/logsoftmax.cpp (+4 −4)
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP

#include <migraphx/gpu/device/tensor_view.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1:
    {
        f(std::integral_constant<std::size_t, 1>{});
        break;
    }
    case 2:
    {
        f(std::integral_constant<std::size_t, 2>{});
        break;
    }
    case 3:
    {
        f(std::integral_constant<std::size_t, 3>{});
        break;
    }
    case 4:
    {
        f(std::integral_constant<std::size_t, 4>{});
        break;
    }
    case 5:
    {
        f(std::integral_constant<std::size_t, 5>{});
        break;
    }
    default: throw std::runtime_error("Unknown tensor size");
    }
}

inline shape get_shape(const shape& x) { return x; }

template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
    return x.get_shape();
}

template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
    if(!std::all_of(types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
        MIGRAPHX_THROW("Types must be the same");
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) {
        s.visit_type([&](auto as) { v(f(xs, ndim, as)...); });
    });
}

template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
}

template <class F>
struct hip_convert
{
    F f;

    template <class RawData, class N, class As>
    auto operator()(RawData x, N ndim, As as) const
        -> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
    {
        return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
    }

    template <class N, class As>
    auto operator()(const shape& s, N ndim, As) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert<F> make_hip_convert(F f)
{
    return {f};
}

template <class F>
struct hip_convert_view
{
    F f;

    template <class T, class N>
    auto operator()(tensor_view<T> x, N ndim) const
    {
        return make_hip_view<ndim>(f(x));
    }

    template <class N>
    auto operator()(const shape& s, N ndim) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
    return {f};
}

template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
    };
}

template <std::size_t N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x),
            make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
            f,
            x,
            xs...);
    };
}

template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); });
    };
}

template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_views_impl(
            get_shape(x), make_hip_convert_view([](auto v) { return device_cast(v); }), f, x, xs...);
    };
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
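Note: this header gives device code a compile-time rank. visit_tensor_size dispatches a runtime rank (1 to 5) to a std::integral_constant, and hip_visit_all first checks that every argument matches the leading shape's type and rank, then hands the visitor HIP-side views. As a rough orientation, an elementwise copy written against this API might look like the hypothetical sketch below (not part of the commit; it assumes the same device headers that define argument, gs_launch, and the view types):

// Hypothetical usage sketch of hip_visit_all (illustration only).
void copy_example(hipStream_t stream, const argument& result, const argument& arg)
{
    // Both arguments are visited together; the lambda receives device views
    // whose rank is the compile-time constant chosen by visit_tensor_size.
    hip_visit_all(result, arg)([&](auto output, auto input) {
        gs_launch(stream, result.get_shape().elements())([=](auto i) {
            // Convert the linear work-item id into a fixed-rank multi-index.
            auto idx    = output.get_shape().multi(i);
            output[idx] = input.data()[i];
        });
    });
}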
src/targets/gpu/device/logsoftmax.cpp

 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
...
@@ -11,66 +12,45 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream,
-                    const migraphx::shape& output_shape,
-                    std::vector<migraphx::argument> args,
-                    int axis)
-{
-    auto lens         = output_shape.lens();
-    auto num_in_batch = lens[axis];
-    auto batch_lens   = lens;
-    batch_lens[axis]  = 1;
-    migraphx::shape batch_shape{output_shape.type(), batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-            // each thread is for one item in the batch
-            gs_launch(stream, batch_shape.elements())([=](auto i) {
-                auto batch_idx = desc_batch.multi(i);
-                auto data_idx  = batch_idx;
-                // get max
-                auto batch_max = input_ptr[desc_data.linear(batch_idx)];
-                for(std::size_t j = 1; j < num_in_batch; ++j)
-                {
-                    data_idx[axis] = j;
-                    size_t idx     = desc_data.linear(data_idx);
-                    batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[idx]));
-                }
-                for(std::size_t j = 0; j < num_in_batch; ++j)
-                {
-                    data_idx[axis] = j;
-                    size_t idx      = desc_data.linear(data_idx);
-                    output_ptr[idx] = input_ptr[idx] - batch_max;
-                }
-                auto batch_sum = ::exp(to_hip_type(output_ptr[desc_data.linear(batch_idx)]));
-                for(std::size_t j = 1; j < num_in_batch; ++j)
-                {
-                    data_idx[axis] = j;
-                    size_t idx = desc_data.linear(data_idx);
-                    batch_sum += ::exp(to_hip_type(output_ptr[idx]));
-                }
-                batch_sum = ::log(to_hip_type(batch_sum));
-                for(std::size_t j = 0; j < num_in_batch; ++j)
-                {
-                    data_idx[axis] = j;
-                    size_t idx = desc_data.linear(data_idx);
-                    output_ptr[idx] -= batch_sum;
-                }
-            });
-        });
-    });
-    return args.back();
-}
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
+{
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+            [=](auto i, auto idx) __device__ {
+                auto data_idx = batch.multi(i / block_size);
+                using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+                type init  = lowest();
+
+                auto batch_max = block_reduce<max_block_size>(
+                    idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                        data_idx[axis] = j;
+                        return input[data_idx];
+                    });
+
+                auto batch_sum = block_reduce<max_block_size>(
+                    idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                        data_idx[axis] = j;
+                        auto val       = input[data_idx] - batch_max;
+                        return ::exp(to_hip_type(val));
+                    });
+
+                auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
+                idx.local_stride(batch_item_num, [&](auto j) {
+                    data_idx[axis]   = j;
+                    output[data_idx] = input[data_idx] - log_batch_sum;
+                });
+            });
+    });
+}

 } // namespace device
...
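Note: the rewrite replaces the old per-thread serial loops with two block_reduce passes per batch (a max, then a sum of shifted exponentials) and folds the final writes into one identity: log_softmax(x)_j = x_j − (m + log Σ_k exp(x_k − m)) with m = max_k x_k, which is the log_batch_sum above. A host-side C++ reference of that identity, useful for checking the kernel's arithmetic (illustration only, not part of the commit):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<double> log_softmax_ref(const std::vector<double>& x)
{
    double m = *std::max_element(x.begin(), x.end()); // subtract the max for stability
    double s = 0;
    for(double v : x)
        s += std::exp(v - m);
    double log_sum = std::log(s) + m; // matches log_batch_sum in the kernel
    std::vector<double> out;
    for(double v : x)
        out.push_back(v - log_sum);
    return out;
}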
src/targets/gpu/device/pad.cpp

@@ -15,33 +15,26 @@ argument
 pad(hipStream_t stream,
     argument result,
     argument arg1,
     float value,
     std::vector<std::int64_t> pads)
 {
     std::size_t nelements = arg1.get_shape().elements();
-    visit_all(result)([&](auto output) {
-        auto* outptr = device_cast(output.data());
-        using type   = typename decltype(output)::value_type;
-        device_type<type> device_val = value;
+    hip_visit_all(result, arg1)([&](auto output, auto input) {
+        using type      = typename decltype(output)::value_type;
+        using hip_index = typename decltype(output)::hip_index;
+        type device_val = value;
         if(float_equal(value, std::numeric_limits<float>::lowest()))
         {
             device_val = device_cast(std::numeric_limits<type>::lowest());
         }
-        gs_launch(stream, result.get_shape().elements())([=](auto i) { outptr[i] = device_val; });
-    });
-    visit_all(result, arg1)([&](auto output, auto input) {
-        visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
-            std::size_t offsets[ndim];
-            std::copy(pads.begin(), pads.begin() + ndim, offsets);
-            auto* outptr      = output.data();
-            const auto* inptr = input.data();
-            hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-            hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-            gs_launch(stream, nelements)([=](auto i) {
-                auto idx = desc_input.multi(i);
-                for(std::size_t j = 0; j < ndim; j++)
-                {
-                    idx[j] += offsets[j];
-                }
-                outptr[desc_output.linear(idx)] = inptr[i];
-            });
-        });
+        gs_launch(stream, result.get_shape().elements())(
+            [=](auto i) { output.data()[i] = device_val; });
+        hip_index offsets;
+        std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
+        gs_launch(stream, nelements)([=](auto i) {
+            auto idx = input.get_shape().multi(i);
+            for(std::size_t j = 0; j < offsets.size(); j++)
+            {
+                idx[j] += offsets[j];
+            }
+            output[idx] = input.data()[i];
+        });
     });
     return result;
...
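Note: the kernel runs in two phases, a fill of the whole output with the pad value and a scatter of each input element to its index shifted by the leading pads (the kernel copies the first rank-many entries of pads into offsets). The same semantics in one dimension, as a host-side sketch (illustration only, not part of the commit):

#include <cstdint>
#include <vector>

std::vector<float> pad1d_ref(const std::vector<float>& in, int64_t before, int64_t after, float value)
{
    std::vector<float> out(in.size() + before + after, value); // fill phase
    for(std::size_t i = 0; i < in.size(); ++i)
        out[i + before] = in[i]; // scatter phase: idx[j] += offsets[j]
    return out;
}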
src/targets/gpu/device/reduce_sum.cpp (new file, mode 100644)

#include <migraphx/gpu/device/reduce_sum.hpp>
#include <migraphx/gpu/device/reduce.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void reduce_sum(hipStream_t stream, const argument& result, const argument& arg)
{
    reduce(stream, result, arg, sum{}, 0, id{}, id{});
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
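Note: reduce_sum is a thin wrapper over the generic reduce kernel from reduce.hpp: sum{} is the combining op, 0 its identity, and the two id{} arguments appear to be element-wise input and output transforms (the parameter roles are an assumption; this diff does not show reduce's definition). A host-side sketch of that calling shape:

// Illustration only, assuming reduce's last two arguments are pre/post transforms.
#include <vector>

template <class Op, class T, class In, class Out>
T reduce_ref(const std::vector<T>& xs, Op op, T init, In fin, Out fout)
{
    T acc = init;
    for(const auto& x : xs)
        acc = op(acc, fin(x)); // fold each transformed element with the op
    return fout(acc);          // post-transform the accumulated value
}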
src/targets/gpu/device/softmax.cpp

@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
...
@@ -12,69 +13,44 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream,
-                 const migraphx::shape& output_shape,
-                 std::vector<migraphx::argument> args,
-                 int axis)
-{
-    auto lens        = output_shape.lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    migraphx::shape batch_shape{shape::int32_type, batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-            // each thread is for one item in the batch
-            gs_launch(stream, batch_shape.elements())([=](auto i) {
-                auto batch_idx = desc_batch.multi(i);
-                auto data_idx  = batch_idx;
-                // get max
-                auto batch_max = input_ptr[desc_data.linear(batch_idx)];
-                for(std::size_t j = 1; j < n_dims; ++j)
-                {
-                    data_idx[axis] = j;
-                    batch_max      = std::max(to_hip_type(batch_max),
-                                              to_hip_type(input_ptr[desc_data.linear(data_idx)]));
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = input_ptr[idx] - batch_max;
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = exp(to_hip_type(output_ptr[idx]));
-                }
-                auto batch_sum = output_ptr[desc_data.linear(batch_idx)];
-                for(std::size_t j = 1; j < n_dims; ++j)
-                {
-                    data_idx[axis] = j;
-                    batch_sum += output_ptr[desc_data.linear(data_idx)];
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    data_idx[axis]  = j;
-                    auto idx        = desc_data.linear(data_idx);
-                    output_ptr[idx] = output_ptr[idx] / batch_sum;
-                }
-            });
-        });
-    });
-    return args.back();
-}
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
+{
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream, batch_shape.elements() * block_size, block_size)(
+            [=](auto i, auto idx) __device__ {
+                auto data_idx = batch.multi(i / block_size);
+                using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+                type init  = lowest();
+
+                auto batch_max = block_reduce<max_block_size>(
+                    idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                        data_idx[axis] = j;
+                        return input[data_idx];
+                    });
+
+                auto batch_sum = block_reduce<max_block_size>(
+                    idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                        data_idx[axis] = j;
+                        auto val       = input[data_idx] - batch_max;
+                        return ::exp(to_hip_type(val));
+                    });
+
+                idx.local_stride(batch_item_num, [&](auto j) {
+                    data_idx[axis]   = j;
+                    auto val         = input[data_idx] - batch_max;
+                    output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+                });
+            });
+    });
+}

 } // namespace device
...
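Note: as with logsoftmax above, one block of block_size work items now handles one batch slice: gs_launch is given batch_shape.elements() * block_size items, block_reduce computes the max and the sum of shifted exponentials, and local_stride writes the normalized values. A host-side reference of the arithmetic (illustration only, not part of the commit):

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<double> softmax_ref(const std::vector<double>& x)
{
    double m = *std::max_element(x.begin(), x.end()); // block_reduce with max{}
    double s = 0;
    for(double v : x)
        s += std::exp(v - m);                         // block_reduce with sum{}
    std::vector<double> out;
    for(double v : x)
        out.push_back(std::exp(v - m) / s);           // final local_stride pass
    return out;
}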
src/targets/gpu/fuse_ops.cpp

@@ -5,6 +5,7 @@
 #include <migraphx/gpu/device/add_relu.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/array.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -122,13 +123,6 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
            s.strides()[1] != 0 and s.strides()[2] == 0 and s.strides()[3] == 0;
 }

-// TODO: Move to another header
-template <class T, class... Ts>
-std::array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
-{
-    return {std::move(x), std::move(static_cast<T>(xs))...};
-}
-
 MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
 {
     if(ins->name() != "gpu::convolution")
...
@@ -206,12 +200,33 @@ struct hip_add_relu
     }
 };

+void move_broadcasted_back(std::vector<instruction_ref>& args)
+{
+    // Ensure the last arguments is the broadcasted one
+    auto it = std::find_if(
+        args.begin(), args.end(), [](auto arg) { return arg->get_shape().broadcasted(); });
+    if(it != args.end())
+        std::swap(*it, *std::prev(args.end(), 2));
+}
+
+void move_standard_front(std::vector<instruction_ref>& args)
+{
+    // Ensure the first arguments is the standard one
+    auto it = std::find_if(
+        args.begin(), args.end(), [](auto arg) { return arg->get_shape().standard(); });
+    if(it != args.end())
+        std::swap(*it, args.front());
+}
+
 struct find_add_relu
 {
     auto matcher() const
     {
-        return match::name("gpu::relu")(match::arg(0)(
-            match::any_of(match::name("gpu::add"), match::name("hip::triadd")).bind("add")));
+        return match::name("gpu::relu")(
+            match::arg(0)(match::any_of(match::name("gpu::add"),
+                                        match::name("hip::triadd"),
+                                        match::any_of[match::inputs()](match::standard_shape()))
+                              .bind("add")));
     }

     void apply(program& p, match::matcher_result r) const
...
@@ -219,6 +234,9 @@ struct find_add_relu
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
         auto args    = add_ins->inputs();
+        move_standard_front(args);
+        move_broadcasted_back(args);
+
         // Use the allocation from the relu operator
         args.back() = ins->inputs().back();
         if(add_ins->name() == "gpu::add")
...
@@ -232,24 +250,26 @@ struct find_triadd
 {
     auto matcher() const
     {
-        return match::name("gpu::add")(match::either_arg(0, 1)(
-            match::name("gpu::add").bind("add"), match::any().bind("input")));
+        return match::name("gpu::add")(match::either_arg(0, 1)(
+            match::name("gpu::add").bind("add"),
+            match::any(match::any_of[match::inputs()](match::standard_shape())).bind("input")));
     }

     void apply(program& p, match::matcher_result r) const
     {
         auto add_ins   = r.instructions["add"];
         auto input_ins = r.instructions["input"];
         auto ins       = r.result;
         auto args      = add_ins->inputs();
+        assert(add_ins != input_ins);
         auto is_broadcasted = [](auto arg) { return arg->get_shape().broadcasted(); };
         if(std::count_if(args.begin(), args.end(), is_broadcasted) > 1)
             return;
         args.insert(args.begin(), input_ins);
-        // Ensure the last arguments is the broadcasted one
-        auto it = std::find_if(args.begin(), args.end(), is_broadcasted);
-        if(it != args.end())
-            std::swap(*it, *std::prev(args.end(), 2));
+        move_standard_front(args);
+        move_broadcasted_back(args);
         args.back() = ins->inputs().back();
         p.replace_instruction(ins, hip_triadd{}, args);
     }
...
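Note: the new move_standard_front / move_broadcasted_back helpers canonicalize argument order before fusion; the last slot is reserved for the output allocation, which is why the broadcasted argument is swapped to end() − 2. The same reordering pattern on a plain vector of tags instead of instruction_ref (illustration only):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> args = {"broadcasted", "standard", "alloc"};
    auto it = std::find_if(args.begin(), args.end(),
                           [](const auto& a) { return a == "broadcasted"; });
    if(it != args.end())
        std::swap(*it, *std::prev(args.end(), 2)); // move just before the allocation
    assert(args[1] == "broadcasted" && args.back() == "alloc");
}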
src/targets/gpu/gather.cpp

@@ -12,11 +12,9 @@ shape hip_gather::compute_shape(std::vector<shape> inputs) const
     return op.compute_shape(inputs);
 }

-argument hip_gather::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
+argument
+hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::gather(ctx.get_stream().get(), output_shape, args, op.axis);
+    return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis);
 }

 } // namespace gpu
...
src/targets/gpu/include/migraphx/gpu/argmax.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP
#define MIGRAPHX_GUARD_RTGLIB_ARGMAX_HPP

#include <migraphx/shape.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/gpu/device/argmax.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_argmax
{
    op::argmax op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::argmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/argmin.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP
#define MIGRAPHX_GUARD_RTGLIB_ARGMIN_HPP

#include <migraphx/shape.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/gpu/device/argmin.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_argmin
{
    op::argmin op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::argmin"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARG_OP_HPP

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T>
struct val_index
{
    T val;
    int64_t index;
};

template <class T>
MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v)
{
    return {v, -1};
}

template <class T>
MIGRAPHX_DEVICE_CONSTEXPR val_index<T> make_val_index(T v, int64_t i)
{
    return {v, i};
}

struct argmax_op
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
    {
        if(x.val > y.val)
            return x;
        else if(x.val < y.val)
            return y;
        else
        {
            return (x.index < y.index) ? x : y;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return lowest(); }
};

struct argmin_op
{
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR val_index<T> operator()(val_index<T> x, val_index<T> y) const
    {
        if(x.val < y.val)
            return x;
        else if(x.val > y.val)
            return y;
        else
        {
            return (x.index < y.index) ? x : y;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR auto init() const { return highest(); }
};

template <class Op>
void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
{
    auto arg_shape        = arg.get_shape();
    auto lens             = arg_shape.lens();
    auto batch_lens       = lens;
    size_t batch_item_num = lens[axis];
    batch_lens[axis]      = 1;
    migraphx::shape batch_shape{arg_shape.type(), batch_lens};

    hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
        auto output = device_cast(result.get<int64_t>().data());
        using type  = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
        // use one block for items in one batch.
        const size_t max_block_size  = 256;
        const std::size_t block_size = compute_block_size(batch_item_num, max_block_size);
        gs_launch(stream, batch_shape.elements() * block_size, block_size)(
            [=](auto i, auto idx) __device__ {
                auto batch_idx = batch_s.multi(i / block_size);
                auto data_idx  = batch_idx;
                auto init      = make_val_index<type>(op.init());

                auto op_output = block_reduce<max_block_size>(
                    idx, op, init, batch_item_num, [&](auto j) __device__ {
                        data_idx[axis] = j;
                        return make_val_index(input[arg_s.index(data_idx)], j);
                    });

                if(idx.local == 0)
                {
                    output[batch_s.index(batch_idx)] = op_output.index;
                }
            });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
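Note: argmax_op and argmin_op reduce (value, index) pairs, and on equal values they keep the smaller index, so ties resolve to the first occurrence along the axis. That tie-breaking rule, checked on the host (illustration only, mirroring argmax_op::operator() above):

#include <cassert>
#include <cstdint>

struct vi { double val; int64_t index; };

vi combine_max(vi x, vi y) // mirrors argmax_op::operator()
{
    if(x.val > y.val) return x;
    if(x.val < y.val) return y;
    return (x.index < y.index) ? x : y;
}

int main()
{
    assert(combine_max(vi{3.0, 1}, vi{3.0, 4}).index == 1); // tie: earlier index wins
    assert(combine_max(vi{1.0, 0}, vi{3.0, 4}).index == 4); // otherwise the larger value wins
}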
src/targets/gpu/include/migraphx/gpu/device/argmax.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMAX_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void argmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/device/argmin.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARGMIN_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void argmin(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/device/erf.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ERF_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ERF_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void erf(hipStream_t stream, const argument& result, const argument& arg);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/device/gather.hpp

@@ -10,10 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument gather(hipStream_t stream,
-                const migraphx::shape& output_shape,
-                std::vector<migraphx::argument> args,
-                int axis);
+argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis);

 } // namespace device
 } // namespace gpu
...
src/targets/gpu/include/migraphx/gpu/device/logsoftmax.hpp

@@ -10,10 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream,
-                    const migraphx::shape& output_shape,
-                    std::vector<migraphx::argument> args,
-                    int axis);
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu
...
src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void reduce_sum(hipStream_t stream, const argument& result, const argument& arg);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/device/softmax.hpp

@@ -10,10 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument softmax(hipStream_t stream,
-                 const migraphx::shape& output_shape,
-                 std::vector<migraphx::argument> args,
-                 int axis);
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis);

 } // namespace device
 } // namespace gpu
...
src/targets/gpu/include/migraphx/gpu/erf.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_ERF_HPP
#define MIGRAPHX_GUARD_RTGLIB_ERF_HPP

#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/erf.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct hip_erf : unary_device<hip_erf, device::erf>
{
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP
#define MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP

#include <migraphx/shape.hpp>
#include <migraphx/op/reduce_sum.hpp>
#include <migraphx/reflect.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct context;

struct hip_reduce_sum
{
    op::reduce_sum op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "gpu::reduce_sum"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
src/targets/gpu/logsoftmax.cpp

@@ -15,11 +15,11 @@ shape hip_logsoftmax::compute_shape(const std::vector<shape>& inputs) const
     return op.compute_shape({inputs.at(0)});
 }

-argument hip_logsoftmax::compute(context& ctx,
-                                 const shape& output_shape,
-                                 const std::vector<argument>& args) const
+argument
+hip_logsoftmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::logsoftmax(ctx.get_stream().get(), output_shape, args, op.axis);
+    device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    return args.back();
 }

 } // namespace gpu
...