MIGraphX commit 3855c6af
Authored Jun 25, 2019 by Shucai Xiao

Merge branch 'opt_log_softmax_new_device_code' into argmax_min

Parents: b222af2f, 93eae2df
Changes: 27 files in the commit. Showing 20 changed files with 1020 additions and 654 deletions (+1020, -654).
Changed files:

src/include/migraphx/array.hpp                                      +75   -0
src/include/migraphx/functional.hpp                                 +43   -2
src/include/migraphx/ranges.hpp                                     +22   -0
src/include/migraphx/raw_data.hpp                                   +19   -0
src/targets/cpu/lowering.cpp                                        +78   -44
src/targets/gpu/device/concat.cpp                                   +10   -12
src/targets/gpu/device/gather.cpp                                   +18   -23
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp        +60   -0
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp         +95   -271
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp        +89   -0
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp       +13   -54
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp  +59   -0
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp        +44   -1
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp       +76   -0
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp        +170  -0
src/targets/gpu/device/logsoftmax.cpp                               +66   -110
src/targets/gpu/device/pad.cpp                                      +15   -22
src/targets/gpu/device/softmax.cpp                                  +63   -102
src/targets/gpu/fuse_ops.cpp                                        +3    -9
src/targets/gpu/gather.cpp                                          +2    -4
src/include/migraphx/array.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_ARRAY_HPP

#include <migraphx/config.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/requires.hpp>
#include <type_traits>
#include <array>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

namespace detail {

template <class R, class...>
struct array_type
{
    using type = R;
};

template <class... Ts>
struct array_type<void, Ts...> : std::common_type<Ts...>
{
};

template <class R, class... Ts>
using array_type_t = typename array_type<R, Ts...>::type;

template <class T, std::size_t N, std::size_t... I>
constexpr std::array<std::remove_cv_t<T>, N> to_array_impl(T (&a)[N], seq<I...>)
{
    return {{a[I]...}};
}

} // namespace detail

template <class Result = void, class... Ts, MIGRAPHX_REQUIRES((sizeof...(Ts) > 0))>
constexpr std::array<detail::array_type_t<Result, Ts...>, sizeof...(Ts)> make_array(Ts&&... xs)
{
    return {static_cast<detail::array_type_t<Result, Ts...>>(std::forward<Ts>(xs))...};
}

constexpr std::array<int, 0> make_array() { return {}; }

template <class T, std::size_t N>
constexpr auto to_array(T (&a)[N])
{
    return detail::to_array_impl(a, detail::gens<N>{});
}

namespace detail {
template <std::size_t Offset = 0, class Array, std::size_t... I>
constexpr auto rearray_impl(Array a, seq<I...>)
{
    return make_array(a[I + Offset]...);
}
} // namespace detail

// pop_front removes the first element (offset 1); pop_back removes the last
// (offset 0, keeping the first N - 1 elements).
template <class T, std::size_t N>
constexpr auto pop_front(std::array<T, N> a)
{
    return detail::rearray_impl<1>(a, detail::gens<N - 1>{});
}

template <class T, std::size_t N>
constexpr auto pop_back(std::array<T, N> a)
{
    return detail::rearray_impl(a, detail::gens<N - 1>{});
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
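
For orientation, a minimal host-side usage sketch of these helpers (hypothetical, not part of the commit; it assumes only the functions defined above):

    // Hypothetical usage sketch of migraphx::make_array/to_array/pop_front/pop_back.
    #include <migraphx/array.hpp>
    #include <cassert>

    int main()
    {
        int raw[3] = {1, 2, 3};
        auto a = migraphx::to_array(raw);      // std::array<int, 3>{{1, 2, 3}}
        auto b = migraphx::make_array(1, 2.5); // std::array<double, 2> via std::common_type
        auto c = migraphx::pop_front(a);       // {2, 3}
        auto d = migraphx::pop_back(a);        // {1, 2}
        assert(c[0] == 2 and d[1] == 2);
        (void)b;
        return 0;
    }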
src/include/migraphx/functional.hpp

@@ -15,6 +15,12 @@ struct swallow
     }
 };
 
+template <class T>
+auto tuple_size(const T&)
+{
+    return typename std::tuple_size<T>::type{};
+}
+
 namespace detail {
 
 template <class R, class F>
@@ -83,6 +89,12 @@ constexpr auto sequence_c(F&& f)
     return detail::sequence_c_impl(f, detail::gens<N>{});
 }
 
+template <class IntegerConstant, class F>
+constexpr auto sequence(IntegerConstant ic, F&& f)
+{
+    return sequence_c<ic>(f);
+}
+
 template <class F, class... Ts>
 constexpr void each_args(F f, Ts&&... xs)
 {
@@ -95,9 +107,9 @@ constexpr void each_args(F)
 }
 
 template <class F, class T>
-auto unpack(F f, T& x)
+auto unpack(F f, T&& x)
 {
-    return sequence_c<std::tuple_size<T>{}>([&](auto... is) { f(std::get<is>(x)...); });
+    return sequence(tuple_size(x), [&](auto... is) { f(std::get<is>(static_cast<T&&>(x))...); });
 }
 
 /// Implements a fix-point combinator
@@ -149,6 +161,35 @@ auto index_of(T& x)
     return [&](auto&& y) { return x[y]; };
 }
 
+template <class T, class... Ts>
+decltype(auto) front_args(T&& x, Ts&&...)
+{
+    return static_cast<T&&>(x);
+}
+
+template <class... Ts>
+decltype(auto) back_args(Ts&&... xs)
+{
+    return std::get<sizeof...(Ts) - 1>(std::tuple<Ts&&...>(static_cast<Ts&&>(xs)...));
+}
+
+template <class T, class... Ts>
+auto pop_front_args(T&&, Ts&&... xs)
+{
+    return [&](auto f) { f(static_cast<Ts&&>(xs)...); };
+}
+
+template <class... Ts>
+auto pop_back_args(Ts&&... xs)
+{
+    return [&](auto f) {
+        using tuple_type = std::tuple<Ts&&...>;
+        auto t           = tuple_type(static_cast<Ts&&>(xs)...);
+        return sequence_c<sizeof...(Ts) - 1>(
+            [&](auto... is) { return f(std::get<is>(static_cast<tuple_type&&>(t))...); });
+    };
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
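
The reworked unpack forwards the tuple, so it now accepts rvalue tuples as well as lvalues; a small sketch (hypothetical, not from the commit):

    // Hypothetical sketch: unpack applies f to the elements of a tuple.
    #include <migraphx/functional.hpp>
    #include <iostream>
    #include <tuple>

    int main()
    {
        auto t = std::make_tuple(1, 2.5, 'x');
        migraphx::unpack(
            [](int a, double b, char c) { std::cout << a << ' ' << b << ' ' << c << '\n'; }, t);
        return 0;
    }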
src/include/migraphx/ranges.hpp

@@ -33,6 +33,10 @@ auto generic_find_impl(rank<0>, C&& c, const T& x)
     return std::find(c.begin(), c.end(), x);
 }
 
+struct empty
+{
+};
+
 } // namespace detail
 
 template <class C, class T>
@@ -71,6 +75,12 @@ bool all_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::all_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool all_of(detail::empty, const Predicate&)
+{
+    return true;
+}
+
 template <class C, class Predicate>
 bool any_of(const C& c, const Predicate& p)
 {
@@ -83,6 +93,12 @@ bool any_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::any_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool any_of(detail::empty, const Predicate&)
+{
+    return false;
+}
+
 template <class C, class Predicate>
 bool none_of(const C& c, const Predicate& p)
 {
@@ -95,6 +111,12 @@ bool none_of(const std::initializer_list<T>& c, const Predicate& p)
     return std::none_of(c.begin(), c.end(), p);
 }
 
+template <class Predicate>
+bool none_of(detail::empty, const Predicate&)
+{
+    return true;
+}
+
 template <class Range, class Iterator>
 void copy(Range&& r, Iterator it)
 {
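
The detail::empty overloads return the same vacuous-truth answers as the std algorithms on an empty range, so generic callers can pass a placeholder where no container exists; a sketch (hypothetical):

    // Hypothetical sketch of the vacuous-truth behavior.
    #include <migraphx/ranges.hpp>
    #include <cassert>

    int main()
    {
        auto pred = [](int) { return false; };
        assert(migraphx::all_of(migraphx::detail::empty{}, pred));  // vacuously true
        assert(!migraphx::any_of(migraphx::detail::empty{}, pred)); // vacuously false
        assert(migraphx::none_of(migraphx::detail::empty{}, pred)); // vacuously true
        return 0;
    }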
src/include/migraphx/raw_data.hpp

@@ -212,6 +212,25 @@ auto visit_all(T&& x, Ts&&... xs)
     };
 }
 
+template <class T>
+auto visit_all(const std::vector<T>& x)
+{
+    auto&& s = x.front().get_shape();
+    if(!std::all_of(
+           x.begin(), x.end(), [&](const T& y) { return y.get_shape().type() == s.type(); }))
+        MIGRAPHX_THROW("Types must be the same");
+    return [&](auto v) {
+        s.visit_type([&](auto as) {
+            using type = typename decltype(as)::type;
+            std::vector<tensor_view<type>> result;
+            std::transform(x.begin(), x.end(), std::back_inserter(result), [&](const auto& y) {
+                return make_view(y.get_shape(), as.from(y.data()));
+            });
+            v(result);
+        });
+    };
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
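
A sketch of calling the new vector overload (hypothetical, not from the commit):

    // Hypothetical sketch: visiting a whole vector of arguments at once.
    #include <migraphx/argument.hpp>
    #include <migraphx/raw_data.hpp>
    #include <vector>

    void example(const std::vector<migraphx::argument>& xs)
    {
        migraphx::visit_all(xs)([&](auto views) {
            // views is a std::vector<tensor_view<T>> sharing one element type T;
            // mixed element types throw "Types must be the same".
            (void)views;
        });
    }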
src/targets/cpu/lowering.cpp

@@ -530,17 +530,26 @@ struct cpu_softmax
     std::string name() const { return "cpu::softmax"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
 
-    template <typename T>
-    std::size_t compute_batch_index(T idx, shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
+    std::vector<size_t> compute_batch_indices(size_t idx, const shape& s) const
+    {
+        std::vector<std::size_t> indices(s.lens().size());
+        std::transform(s.strides().begin(),
+                       s.strides().end(),
+                       s.lens().begin(),
+                       indices.begin(),
+                       [&](std::size_t stride, std::size_t len) {
+                           assert(len > 0 and stride > 0);
+                           return (idx / stride) % len;
+                       });
+        return indices;
+    }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         auto batch_lens = output_shape.lens();
+        size_t n_dims   = batch_lens[op.axis];
         batch_lens[op.axis] = 1;
         shape batch_shape{shape::int32_type, batch_lens};
@@ -548,26 +557,34 @@ struct cpu_softmax
             using value_type = typename decltype(input)::value_type;
             std::vector<value_type> batch_max(batch_shape.elements(),
                                               std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) =
-                    std::exp(input(idx.begin(), idx.end()) - batch_max[index]);
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += output(idx.begin(), idx.end());
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) /= batch_sum[index];
-            });
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = this->compute_batch_indices(i, batch_shape);
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }
+                std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]  = j;
+                    size_t index  = output_shape.index(idx);
+                    output[index] = std::exp(input[index] - batch_max[i]);
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += output(idx.begin(), idx.end());
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) /= batch_sum[i];
+                }
+            });
         });
@@ -588,48 +605,65 @@ struct cpu_logsoftmax
     std::string name() const { return "cpu::logsoftmax"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
 
-    template <typename T>
-    std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
-    {
-        idx[axis] = 0;
-        return batch_shape.index(idx);
-    }
+    std::vector<size_t> compute_batch_indices(size_t idx, const shape& s) const
+    {
+        std::vector<std::size_t> indices(s.lens().size());
+        std::transform(s.strides().begin(),
+                       s.strides().end(),
+                       s.lens().begin(),
+                       indices.begin(),
+                       [&](std::size_t stride, std::size_t len) {
+                           assert(len > 0 and stride > 0);
+                           return (idx / stride) % len;
+                       });
+        return indices;
+    }
 
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
         auto batch_lens = output_shape.lens();
+        size_t n_dims   = batch_lens[op.axis];
         batch_lens[op.axis] = 1;
         shape batch_shape{shape::int32_type, batch_lens};
 
+        // use a parallel implementation to achieve better performance:
+        // one thread for one batch
         visit_all(result, args[0])([&](auto output, auto input) {
             using value_type = typename decltype(input)::value_type;
             std::vector<value_type> batch_max(batch_shape.elements(),
                                               std::numeric_limits<value_type>::lowest());
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
-            });
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
-            });
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
-            });
+            par_for(batch_shape.elements(), [&](auto i) {
+                auto idx = this->compute_batch_indices(i, batch_shape);
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
+                }
+                std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis]  = j;
+                    size_t index  = output_shape.index(idx);
+                    output[index] = input[index] - batch_max[i];
+                }
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    batch_sum[i] += std::exp(output(idx.begin(), idx.end()));
+                }
 
             for(std::size_t i = 0; i < batch_sum.size(); ++i)
             {
                 batch_sum[i] = std::log(batch_sum[i]);
             }
 
-            shape_for_each(output_shape, [&](auto idx) {
-                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) -= batch_sum[index];
-            });
+                for(size_t j = 0; j < n_dims; ++j)
+                {
+                    idx[op.axis] = j;
+                    output(idx.begin(), idx.end()) -= batch_sum[i];
+                }
+            });
         });
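
For intuition, compute_batch_indices recovers a multi-dimensional index from a flat element offset with (idx / stride) % len per dimension; a standalone sketch of the arithmetic (hypothetical, outside MIGraphX):

    // Hypothetical standalone sketch of the (idx / stride) % len recovery.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // A packed 2x1x4 batch shape has strides {4, 4, 1}.
        std::vector<std::size_t> lens    = {2, 1, 4};
        std::vector<std::size_t> strides = {4, 4, 1};
        std::size_t idx = 6; // flat offset
        for(std::size_t i = 0; i < lens.size(); i++)
            std::cout << (idx / strides[i]) % lens[i] << ' '; // prints: 1 0 2
        std::cout << '\n'; // and 1*4 + 0*4 + 2*1 == 6 maps it back
        return 0;
    }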
src/targets/gpu/device/concat.cpp

@@ -10,22 +10,20 @@ namespace gpu {
 namespace device {
 
 argument concat(hipStream_t stream,
-                const migraphx::shape& output_shape,
+                const migraphx::shape&,
                 std::vector<migraphx::argument> args,
                 std::vector<std::size_t> offsets)
 {
-    for(std::size_t l = 0; l < args.size() - 1; l++)
+    auto ninputs = args.size() - 1;
+    for(std::size_t j = 0; j < ninputs; j++)
     {
-        auto argl             = args[l];
-        std::size_t nelements = argl.get_shape().elements();
-        visit_all(args.back(), argl)([&](auto output, auto input) {
-            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
-                auto* outptr      = output.data() + offsets[l];
-                const auto* inptr = input.data();
-                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-                gs_launch(stream, nelements)(
-                    [=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
-            });
+        auto&& arg            = args[j];
+        std::size_t nelements = arg.get_shape().elements();
+        auto offset           = offsets[j];
+        hip_visit_all(args.back(), arg)([&](auto output, auto input) {
+            gs_launch(stream, nelements)([=](auto i) {
+                auto idx = output.get_shape().index(input.get_shape().multi(i));
+                output.data()[idx + offset] = input.data()[i];
+            });
         });
     }
src/targets/gpu/device/gather.cpp

@@ -11,35 +11,30 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument gather(hipStream_t stream,
-                const migraphx::shape& output_shape,
-                std::vector<migraphx::argument> args,
-                int axis)
+argument gather(hipStream_t stream, argument result, argument arg1, argument arg2, int axis)
 {
-    auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
-    visit_all(args.back(), args[0])([&](auto output, auto input) {
-        std::size_t nelements = output_shape.elements();
-        args[1].visit([&](auto indices) {
-            const auto* indices_ptr = device_cast(indices.data());
-            auto* out_ptr           = device_cast(output.data());
-            const auto* in_ptr      = device_cast(input.data());
-            auto& input_shape       = args[0].get_shape();
-            auto lens               = input_shape.lens();
-            lens[axis_index]        = args[1].get_shape().elements();
-            migraphx::shape out_comp_shape{output_shape.type(), lens};
-            visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
-                hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
-                hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
-                gs_launch(stream, nelements)([=](auto ii) {
-                    auto in_idx        = desc_output.multi(ii);
-                    in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
-                    out_ptr[ii]        = in_ptr[desc_input.linear(in_idx)];
-                });
-            });
-        });
-    });
-    return args.back();
+    auto axis_index   = (axis < 0) ? (axis + arg1.get_shape().lens().size()) : axis;
+    auto& input_shape = arg1.get_shape();
+    auto lens         = input_shape.lens();
+    lens[axis_index]  = arg2.get_shape().elements();
+    shape out_comp_shape{result.get_shape().type(), lens};
+    std::size_t nelements = result.get_shape().elements();
+    visit_all(result, arg1)([&](auto output, auto input_v) {
+        hip_visit_views(input_v, out_comp_shape)([&](auto input, auto out_comp) {
+            arg2.visit([&](auto indices) {
+                const auto* indices_ptr = device_cast(indices.data());
+                auto* output_ptr        = device_cast(output.data());
+                gs_launch(stream, nelements)([=](auto i) {
+                    auto idx        = out_comp.multi(i);
+                    idx[axis_index] = indices_ptr[idx[axis_index]];
+                    output_ptr[i]   = input[idx];
+                });
+            });
+        });
+    });
+    return result;
 }
 
 } // namespace device
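
The kernel implements the usual gather semantics along one axis: each output position replaces its coordinate on the axis with a lookup through the indices tensor. The 1-D case, as a scalar reference (hypothetical):

    // Hypothetical scalar reference for gather on a 1-D input.
    #include <cstddef>
    #include <vector>

    std::vector<float> gather_ref(const std::vector<float>& data,
                                  const std::vector<std::size_t>& indices)
    {
        std::vector<float> out;
        out.reserve(indices.size());
        for(std::size_t i : indices)
            out.push_back(data[i]); // out.size() == indices.size()
        return out;
    }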
src/targets/gpu/device/include/migraphx/gpu/device/array.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ARRAY_HPP

#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_array
{
    T d[N];

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::integral_constant<std::size_t, N> size() const { return {}; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    MIGRAPHX_DEVICE_CONSTEXPR T dot(const hip_array& x) const
    {
        T result = 0;
        for(std::size_t i = 0; i < N; i++)
            result += x[i] * d[i];
        return result;
    }

    MIGRAPHX_DEVICE_CONSTEXPR T product() const
    {
        T result = 1;
        for(std::size_t i = 0; i < N; i++)
            result *= d[i];
        return result;
    }

    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator*(const hip_array& x, const hip_array& y)
    {
        hip_array result;
        for(std::size_t i = 0; i < N; i++)
            result[i] = x[i] * y[i];
        return result;
    }
};

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
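
dot() is the multi-index-to-linear-offset primitive that hip_shape::index (below) builds on; a host-side sketch (hypothetical; the header needs a HIP-capable compiler since types.hpp pulls in hip_runtime):

    // Hypothetical host-side sketch of hip_array::dot and product.
    #include <migraphx/gpu/device/array.hpp>
    #include <cassert>

    int main()
    {
        migraphx::gpu::device::hip_array<std::size_t, 3> idx     = {{1, 0, 2}};
        migraphx::gpu::device::hip_array<std::size_t, 3> strides = {{4, 4, 1}};
        assert(idx.dot(strides) == 6);   // 1*4 + 0*4 + 2*1
        assert(strides.product() == 16); // 4*4*1
        return 0;
    }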
src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp

(Diff collapsed in this view.)
src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP

#include <migraphx/gpu/device/array.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <std::size_t N>
struct hip_shape
{
    using hip_index = hip_array<std::size_t, N>;

    hip_array<std::size_t, N> lens    = {};
    hip_array<std::size_t, N> strides = {};
    bool standard                     = false;

    __device__ __host__ hip_shape() = default;

    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < x.size(); i++)
            idx += *(x.begin() + i) * strides[i];
        return idx;
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
    {
        if(this->standard)
            return i;
        else
        {
            const std::size_t rank = this->lens.size();
            std::size_t s          = 1;
            std::size_t result     = 0;
            for(std::size_t j = 0; j < this->lens.size(); j++)
            {
                const std::size_t k      = rank - j - 1;
                const std::size_t stride = this->strides[k];
                const std::size_t len    = this->lens[k];
                const std::size_t slen   = s * len;
                const std::size_t idx    = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
    {
        hip_index result;
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < result.size(); is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
        return result;
    }
};

template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
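
For a standard (packed, row-major) shape the strides descend and multi() and index() are inverses; a sketch (hypothetical):

    // Hypothetical sketch: multi/index round-trip on a standard shape.
    #include <migraphx/gpu/device/shape.hpp>
    #include <migraphx/shape.hpp>
    #include <cassert>

    int main()
    {
        migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}}; // strides {12, 4, 1}
        auto hs = migraphx::gpu::device::make_hip_shape<3>(s);
        auto m  = hs.multi(17);    // {1, 1, 1}
        assert(hs.index(m) == 17); // 1*12 + 1*4 + 1*1
        return 0;
    }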
src/targets/gpu/device/include/migraphx/gpu/device/tensor.hpp

 #ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
 #define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
 
 #include <hip/hip_runtime.h>
 #include <migraphx/functional.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/gpu/device/visit.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-template <class F>
-void visit_tensor_size(std::size_t n, F f)
-{
-    switch(n)
-    {
-    case 1: { f(std::integral_constant<std::size_t, 1>{}); break; }
-    case 2: { f(std::integral_constant<std::size_t, 2>{}); break; }
-    case 3: { f(std::integral_constant<std::size_t, 3>{}); break; }
-    case 4: { f(std::integral_constant<std::size_t, 4>{}); break; }
-    case 5: { f(std::integral_constant<std::size_t, 5>{}); break; }
-    default: throw std::runtime_error("Unknown tensor size");
-    }
-}
-
-template <size_t NDim>
-struct hip_index
-{
-    size_t d[NDim];
-    __device__ __host__ size_t& operator[](size_t i) { return d[i]; }
-    __device__ __host__ size_t operator[](size_t i) const { return d[i]; }
-};
+template <std::size_t NDim>
+using hip_tensor_index = hip_array<std::size_t, NDim>;
 
-template <size_t NDim>
+template <std::size_t NDim>
 struct hip_tensor_descriptor
 {
     __device__ __host__ hip_tensor_descriptor() = default;
@@ -63,26 +22,26 @@ struct hip_tensor_descriptor
         std::copy(s.strides().begin(), s.strides().end(), strides);
     }
 
-    __device__ __host__ hip_index<NDim> multi(size_t idx) const
+    __device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
     {
-        hip_index<NDim> result{};
-        size_t tidx = idx;
-        for(size_t is = 0; is < NDim; is++)
+        hip_tensor_index<NDim> result{};
+        std::size_t tidx = idx;
+        for(std::size_t is = 0; is < NDim; is++)
         {
             result[is] = tidx / strides[is];
             tidx       = tidx % strides[is];
         }
         return result;
     }
 
-    __device__ __host__ size_t linear(hip_index<NDim> s) const
+    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
     {
-        size_t idx = 0;
-        for(size_t i = 0; i < NDim; i++)
+        std::size_t idx = 0;
+        for(std::size_t i = 0; i < NDim; i++)
             idx += s[i] * strides[i];
         return idx;
     }
 
-    size_t lens[NDim]    = {};
-    size_t strides[NDim] = {};
+    std::size_t lens[NDim]    = {};
+    std::size_t strides[NDim] = {};
 };
 
 } // namespace device
src/targets/gpu/device/include/migraphx/gpu/device/tensor_view.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP

#include <migraphx/gpu/device/shape.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index  = typename hip_shape<N>::hip_index;

    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}

    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return s.elements(); }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return d[s.index(i)];
    }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }

    private:
    value_type* d = nullptr;
    hip_shape<N> s{};
};

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp

@@ -8,14 +8,45 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
 
 #include <hip/hip_runtime.h>
 #include <migraphx/half.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/tensor_view.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
+#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
+
+template <class T, std::size_t N>
+using vec = T __attribute__((ext_vector_type(N)));
+
+template <std::size_t N, class T>
+__device__ __host__ T* as_pointer(vec<T, N>* x)
+{
+    return reinterpret_cast<T*>(x);
+}
+
+template <std::size_t N, class T>
+__device__ __host__ vec<T, N>* as_vec(T* x)
+{
+    return reinterpret_cast<vec<T, N>*>(x);
+}
+
+template <std::size_t N, class T>
+tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
+{
+    return {x.get_shape(), as_vec<N>(x.data())};
+}
+
+template <std::size_t N, class... Ts>
+auto pack_vec(Ts... xs)
+{
+    return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
+}
+
 using gpu_half = __fp16;
 
 namespace detail {
@@ -25,6 +56,12 @@ struct device_type
     using type = T;
 };
 
+template <class T, std::size_t N>
+struct device_type<vec<T, N>>
+{
+    using type = vec<typename device_type<T>::type, N>;
+};
+
 template <>
 struct device_type<half>
 {
@@ -38,7 +75,7 @@ struct host_type
 };
 
 template <>
-struct device_type<gpu_half>
+struct host_type<gpu_half>
 {
     using type = half;
 };
@@ -75,6 +112,12 @@ device_type<T>* device_cast(T* x)
     return reinterpret_cast<device_type<T>*>(x);
 }
 
+template <class T>
+tensor_view<device_type<T>> device_cast(tensor_view<T> x)
+{
+    return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
+}
+
 template <class T>
 __device__ __host__ T to_hip_type(T x)
 {
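
vec<T, N> maps onto clang's ext_vector_type, and as_vec simply reinterprets a scalar pointer as a vector pointer, so the underlying buffer length must be a multiple of N; a sketch (hypothetical):

    // Hypothetical sketch of as_vec on a scalar buffer.
    #include <migraphx/gpu/device/types.hpp>

    void example()
    {
        float buf[8] = {};
        auto* v = migraphx::gpu::device::as_vec<4>(buf); // vec<float, 4>*, 2 elements
        v[0] = v[0] + 1.0f;                              // updates 4 lanes at once
        float* p = migraphx::gpu::device::as_pointer(v); // back to float*
        (void)p;
    }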
src/targets/gpu/device/include/migraphx/gpu/device/vector.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP

#include <migraphx/gpu/device/types.hpp>
#include <vector>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class T, std::size_t N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : len(s) {}

    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto it = std::copy(start, last, d);
        len     = std::distance(d, it);
    }

    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), d);
        len = x.size();
    }

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return len; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }

    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        d[len] = static_cast<U&&>(x);
        len++;
    }

    private:
    T d[N]          = {};
    std::size_t len = 0;
};

template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    hip_vector<T, N> result(x.size());
    std::copy(x.begin(), x.end(), result.begin());
    return result;
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
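
hip_vector is a fixed-capacity inline buffer (no heap allocation, so it can be copied by value into kernels); note that push_back does not bounds-check against N. A sketch (hypothetical):

    // Hypothetical sketch of hip_vector as a fixed-capacity inline vector.
    #include <migraphx/gpu/device/vector.hpp>
    #include <cassert>

    int main()
    {
        migraphx::gpu::device::hip_vector<int, 4> v = {1, 2};
        v.push_back(3); // caller must keep size() <= 4
        assert(v.size() == 3 and v.back() == 3);
        return 0;
    }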
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp (new file, 0 → 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP

#include <migraphx/gpu/device/tensor_view.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1: { f(std::integral_constant<std::size_t, 1>{}); break; }
    case 2: { f(std::integral_constant<std::size_t, 2>{}); break; }
    case 3: { f(std::integral_constant<std::size_t, 3>{}); break; }
    case 4: { f(std::integral_constant<std::size_t, 4>{}); break; }
    case 5: { f(std::integral_constant<std::size_t, 5>{}); break; }
    default: throw std::runtime_error("Unknown tensor size");
    }
}

inline shape get_shape(const shape& x) { return x; }

template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
    return x.get_shape();
}

template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
    if(!std::all_of(types.begin(), types.end(), [&](migraphx::shape::type_t t) {
           return t == s.type();
       }))
        MIGRAPHX_THROW("Types must be the same");
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) {
        s.visit_type([&](auto as) { v(f(xs, ndim, as)...); });
    });
}

template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
}

template <class F>
struct hip_convert
{
    F f;

    template <class RawData, class N, class As>
    auto operator()(RawData x, N ndim, As as) const
        -> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
    {
        return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
    }

    template <class N, class As>
    auto operator()(const shape& s, N ndim, As) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert<F> make_hip_convert(F f)
{
    return {f};
}

template <class F>
struct hip_convert_view
{
    F f;

    template <class T, class N>
    auto operator()(tensor_view<T> x, N ndim) const
    {
        return make_hip_view<ndim>(f(x));
    }

    template <class N>
    auto operator()(const shape& s, N ndim) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
    return {f};
}

template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
    };
}

template <std::size_t N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(get_shape(x),
                           make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
                           f,
                           x,
                           xs...);
    };
}

template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); });
    };
}

template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_views_impl(get_shape(x),
                             make_hip_convert_view([](auto v) { return device_cast(v); }),
                             f,
                             x,
                             xs...);
    };
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
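
hip_visit_all is the host-side entry point used by the rewritten kernels below: it checks that every argument shares the first argument's element type and rank, then hands the callback fixed-rank, device-ready views. A sketch (hypothetical):

    // Hypothetical sketch of calling hip_visit_all.
    #include <migraphx/argument.hpp>
    #include <migraphx/gpu/device/visit.hpp>

    void example(const migraphx::argument& result, const migraphx::argument& input)
    {
        migraphx::gpu::device::hip_visit_all(result, input)([&](auto out, auto in) {
            // out/in are hip_tensor_view<T, N>: copyable by value, so they can
            // be captured into a __device__ lambda (e.g. via gs_launch).
            (void)out;
            (void)in;
        });
    }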
src/targets/gpu/device/logsoftmax.cpp

 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce_opers.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -11,127 +12,82 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument logsoftmax(hipStream_t stream,
-                    const migraphx::shape& output_shape,
-                    std::vector<migraphx::argument> args,
-                    int axis)
-{
-    auto lens         = output_shape.lens();
-    auto num_in_batch = lens[axis];
-    auto batch_lens   = lens;
-    batch_lens[axis]  = 1;
-    migraphx::shape batch_shape{output_shape.type(), batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-
-            // use one block for items in one batch.
-            // opt 1, load all data to lds then use the same approach as
-            // the current optimization
-            const size_t block_size = 1024;
-            launch(stream, batch_shape.elements() * block_size, block_size)(
-                [=](auto idx) __device__ {
-                    size_t thr_idx = idx.local;
-                    size_t blk_idx = idx.group;
-                    using type =
-                        device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
-                    // all data can be loaded to the lds once, so all operations are
-                    // done in lds
-                    MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
-                    auto batch_idx = desc_batch.multi(blk_idx);
-                    auto data_idx  = batch_idx;
-                    // load data to lds and compute the batch max
-                    size_t item_num      = num_in_batch;
-                    lds_data[block_size] = input_ptr[0];
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
-                                                          to_hip_type(lds_data[thr_idx + stride]));
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size] = (lds_data[0] < lds_data[block_size])
-                                                       ? lds_data[block_size]
-                                                       : lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    const size_t block_size1 = block_size + 1;
-                    lds_data[block_size1]    = 0;
-                    item_num                 = num_in_batch;
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i] = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
-                        lds_data[i] = ::exp(to_hip_type(lds_data[i]));
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] += lds_data[thr_idx + stride];
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size1] += lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    auto log_batch_sum =
-                        ::log(to_hip_type(lds_data[block_size1])) + lds_data[block_size];
-                    item_num = num_in_batch;
-                    for(size_t i = thr_idx; i < num_in_batch; i += block_size)
-                    {
-                        data_idx[axis]    = i;
-                        size_t index      = desc_data.linear(data_idx);
-                        output_ptr[index] = input_ptr[index] - log_batch_sum;
-                    }
-                });
-        });
-    });
-    return args.back();
-}
+void logsoftmax(hipStream_t stream, argument result, argument arg, int axis)
+{
+    auto lens           = result.get_shape().lens();
+    auto batch_item_num = lens[axis];
+    auto batch_lens     = lens;
+    batch_lens[axis]    = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        // use one block for items in one batch.
+        const size_t max_block_size = 1024;
+        size_t block_size           = 1;
+        while(block_size < max_block_size and block_size < batch_item_num)
+        {
+            block_size *= 2;
+        }
+
+        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+            size_t thr_idx = idx.local;
+            size_t blk_idx = idx.group;
+            using type     = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+            auto batch_idx = batch.multi(blk_idx);
+            auto data_idx  = batch_idx;
+            // load data to lds and compute the batch max
+            size_t remaining_item_num = batch_item_num;
+            size_t round_item_num = (batch_item_num + block_size - 1) / block_size * block_size;
+            lds_data[block_size]  = input[0];
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx];
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_max(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_max = lds_data[block_size];
+            __syncthreads();
+            lds_data[block_size] = 0;
+            remaining_item_num   = batch_item_num;
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx] - batch_max;
+                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_sum(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto log_batch_sum = ::log(to_hip_type(lds_data[block_size])) + batch_max;
+            for(size_t i = thr_idx; i < batch_item_num; i += block_size)
+            {
+                data_idx[axis]   = i;
+                output[data_idx] = input[data_idx] - log_batch_sum;
+            }
+        });
+    });
+}
 
 } // namespace device
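
Per batch, the kernel computes the numerically stable form logsoftmax(x_j) = x_j - (log(sum_i exp(x_i - max)) + max); a scalar reference of the same identity (hypothetical, not from the commit):

    // Hypothetical scalar reference for the per-batch logsoftmax identity.
    #include <algorithm>
    #include <cmath>
    #include <vector>

    std::vector<float> logsoftmax_ref(const std::vector<float>& x)
    {
        float batch_max = *std::max_element(x.begin(), x.end());
        float sum       = 0;
        for(float v : x)
            sum += std::exp(v - batch_max);
        float log_batch_sum = std::log(sum) + batch_max;
        std::vector<float> out;
        out.reserve(x.size());
        for(float v : x)
            out.push_back(v - log_batch_sum);
        return out;
    }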
src/targets/gpu/device/pad.cpp

@@ -15,33 +15,26 @@ argument
 pad(hipStream_t stream,
     argument result,
     argument arg1,
     float value,
     std::vector<std::int64_t> pads)
 {
     std::size_t nelements = arg1.get_shape().elements();
-    visit_all(result)([&](auto output) {
-        auto* outptr = device_cast(output.data());
-        using type   = typename decltype(output)::value_type;
-        device_type<type> device_val = value;
+    hip_visit_all(result, arg1)([&](auto output, auto input) {
+        using type      = typename decltype(output)::value_type;
+        using hip_index = typename decltype(output)::hip_index;
+        type device_val = value;
         if(float_equal(value, std::numeric_limits<float>::lowest()))
         {
             device_val = device_cast(std::numeric_limits<type>::lowest());
         }
-        gs_launch(stream, result.get_shape().elements())([=](auto i) { outptr[i] = device_val; });
-    });
-    visit_all(result, arg1)([&](auto output, auto input) {
-        visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
-            std::size_t offsets[ndim];
-            std::copy(pads.begin(), pads.begin() + ndim, offsets);
-            auto* outptr      = output.data();
-            const auto* inptr = input.data();
-            hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-            hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-            gs_launch(stream, nelements)([=](auto i) {
-                auto idx = desc_input.multi(i);
-                for(std::size_t j = 0; j < ndim; j++)
-                {
-                    idx[j] += offsets[j];
-                }
-                outptr[desc_output.linear(idx)] = inptr[i];
-            });
-        });
+        gs_launch(stream, result.get_shape().elements())(
+            [=](auto i) { output.data()[i] = device_val; });
+        hip_index offsets;
+        std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
+        gs_launch(stream, nelements)([=](auto i) {
+            auto idx = input.get_shape().multi(i);
+            for(std::size_t j = 0; j < offsets.size(); j++)
+            {
+                idx[j] += offsets[j];
+            }
+            output[idx] = input.data()[i];
+        });
     });
     return result;
src/targets/gpu/device/softmax.cpp

@@ -2,6 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
+#include <migraphx/gpu/device/reduce_opers.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -12,122 +13,82 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
-argument softmax(hipStream_t stream,
-                 const migraphx::shape& output_shape,
-                 std::vector<migraphx::argument> args,
-                 int axis)
-{
-    auto lens        = output_shape.lens();
-    auto batch_lens  = lens;
-    size_t n_dims    = lens[axis];
-    batch_lens[axis] = 1;
-    migraphx::shape batch_shape{shape::int32_type, batch_lens};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        visit_tensor_size(batch_shape.lens().size(), [&](auto n_dim) {
-            hip_tensor_descriptor<n_dim> desc_batch(batch_shape);
-            hip_tensor_descriptor<n_dim> desc_data(output_shape);
-            // use one block for items in one batch.
-            const size_t block_size = 1024;
-            launch(stream, batch_shape.elements() * block_size, block_size)(
-                [=](auto idx) __device__ {
-                    size_t thr_idx = idx.local;
-                    size_t blk_idx = idx.group;
-                    using type =
-                        device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
-                    // all data can be loaded to the lds once, so all operations are
-                    // done in lds
-                    MIGRAPHX_DEVICE_SHARED type lds_data[block_size + 2];
-                    auto batch_idx = desc_batch.multi(blk_idx);
-                    auto data_idx  = batch_idx;
-                    // load data to lds and compute the batch max
-                    size_t item_num      = n_dims;
-                    lds_data[block_size] = input_ptr[0];
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i]    = input_ptr[desc_data.linear(data_idx)];
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] = ::max(to_hip_type(lds_data[thr_idx]),
-                                                          to_hip_type(lds_data[thr_idx + stride]));
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size] = (lds_data[0] < lds_data[block_size])
-                                                       ? lds_data[block_size]
-                                                       : lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    const size_t block_size1 = block_size + 1;
-                    lds_data[block_size1]    = 0;
-                    item_num                 = n_dims;
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis] = i;
-                        lds_data[i] = input_ptr[desc_data.linear(data_idx)] - lds_data[block_size];
-                        lds_data[i] = ::exp(to_hip_type(lds_data[i]));
-                        __syncthreads();
-                        auto size   = (item_num > block_size) ? block_size : item_num;
-                        auto stride = (size + 1) / 2;
-                        while(true)
-                        {
-                            if(thr_idx + stride < size)
-                            {
-                                lds_data[thr_idx] += lds_data[thr_idx + stride];
-                            }
-                            __syncthreads();
-                            size   = stride;
-                            stride = (stride + 1) / 2;
-                            if(size == 1)
-                                break;
-                        }
-                        if(thr_idx == 0)
-                        {
-                            lds_data[block_size1] += lds_data[0];
-                        }
-                        __syncthreads();
-                        item_num -= block_size;
-                    }
-                    for(size_t i = thr_idx; i < n_dims; i += block_size)
-                    {
-                        data_idx[axis]    = i;
-                        size_t index      = desc_data.linear(data_idx);
-                        auto val          = input_ptr[index] - lds_data[block_size];
-                        output_ptr[index] = ::exp(to_hip_type(val)) / lds_data[block_size1];
-                    }
-                });
-        });
-    });
-    return args.back();
-}
+void softmax(hipStream_t stream, argument result, argument arg, int axis)
+{
+    auto lens             = result.get_shape().lens();
+    auto batch_lens       = lens;
+    size_t batch_item_num = lens[axis];
+    batch_lens[axis]      = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        // use one block for items in one batch.
+        const size_t max_block_size = 1024;
+        size_t block_size           = 1;
+        while(block_size < max_block_size and block_size < batch_item_num)
+        {
+            block_size *= 2;
+        }
+
+        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
+            size_t thr_idx = idx.local;
+            size_t blk_idx = idx.group;
+            using type     = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+            auto batch_idx = batch.multi(blk_idx);
+            auto data_idx  = batch_idx;
+            // load data to lds and compute the batch max
+            size_t remaining_item_num = batch_item_num;
+            size_t round_item_num = (batch_item_num + block_size - 1) / block_size * block_size;
+            lds_data[block_size]  = input[0];
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx];
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_max(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_max = lds_data[block_size];
+            __syncthreads();
+            lds_data[block_size] = 0;
+            remaining_item_num   = batch_item_num;
+            for(size_t i = thr_idx; i < round_item_num; i += block_size)
+            {
+                if(i < batch_item_num)
+                {
+                    data_idx[axis]    = i;
+                    lds_data[thr_idx] = input[data_idx] - batch_max;
+                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
+                }
+                __syncthreads();
+                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
+                reduce_sum(lds_data, block_size, thr_idx, item_num);
+                remaining_item_num -= block_size;
+            }
+            auto batch_sum = lds_data[block_size];
+            for(size_t i = thr_idx; i < batch_item_num; i += block_size)
+            {
+                data_idx[axis]   = i;
+                auto val         = input[data_idx] - batch_max;
+                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
+            }
+        });
+    });
+}
 
 } // namespace device
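
One detail worth noting: round_item_num rounds the batch length up to a multiple of block_size so that every thread runs the same number of loop iterations and therefore reaches the same __syncthreads() calls; a sketch of the arithmetic (hypothetical):

    // Hypothetical sketch of the round-up used for uniform loop trip counts.
    #include <cstddef>
    #include <iostream>

    int main()
    {
        std::size_t batch_item_num = 1000, block_size = 256;
        std::size_t round_item_num =
            (batch_item_num + block_size - 1) / block_size * block_size;
        std::cout << round_item_num << '\n'; // 1024: four strides of 256 per thread
        return 0;
    }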
src/targets/gpu/fuse_ops.cpp

@@ -5,6 +5,7 @@
 #include <migraphx/gpu/device/add_relu.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/array.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -122,13 +123,6 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
            s.strides()[1] != 0 and s.strides()[2] == 0 and s.strides()[3] == 0;
 }
 
-// TODO: Move to another header
-template <class T, class... Ts>
-std::array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
-{
-    return {std::move(x), std::move(static_cast<T>(xs))...};
-}
-
 MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
 {
     if(ins->name() != "gpu::convolution")
@@ -408,8 +402,8 @@ void fuse_ops::apply(program& p) const
     // clang-format off
     match::find_matches(p, find_triadd{});
     match::find_matches(p,
-                        find_conv_bias_relu{ctx},
-                        find_conv_bias{ctx},
+                        // find_conv_bias_relu{ctx},
+                        // find_conv_bias{ctx},
                         find_add_relu{});
     // clang-format on
src/targets/gpu/gather.cpp

@@ -12,11 +12,9 @@ shape hip_gather::compute_shape(std::vector<shape> inputs) const
     return op.compute_shape(inputs);
 }
 
-argument hip_gather::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
+argument hip_gather::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    return device::gather(ctx.get_stream().get(), output_shape, args, op.axis);
+    return device::gather(ctx.get_stream().get(), args.back(), args[0], args[1], op.axis);
 }
 
 } // namespace gpu