merge changes from opt_log_softmax

88ed7f85 · Shucai Xiao · d13dcab5 · 77af16d8 · 88ed7f85 · 88ed7f85
Commit 88ed7f85 authored Jun 28, 2019 by Shucai Xiao
20 changed files
--- a/src/include/migraphx/generate.hpp
+++ b/src/include/migraphx/generate.hpp
@@ -25,7 +25,7 @@ constexpr T normalize(unsigned long z)
 template <class T, MIGRAPHX_REQUIRES(is_signed<T>{} and not is_floating_point<T>{})>
 constexpr T normalize(unsigned long z)
 {
-    const auto max      = std::numeric_limits<T>::max();
+    const auto max      = std::numeric_limits<T>::max() / 64; // 1/64 of T's largest value; NOTE(review): an 8-bit signed T gives max == 1, so every result is 0
    const auto half_max = max / 2;
    return half_max - (z % max);
 }
@@ -33,7 +33,7 @@ constexpr T normalize(unsigned long z)
 template <class T, MIGRAPHX_REQUIRES(not is_signed<T>{} and std::is_integral<T>{})>
 constexpr T normalize(unsigned long z)
 {
-    const auto max = std::numeric_limits<T>::max();
+    const auto max = std::numeric_limits<T>::max() / 64; // results lie in [0, max); NOTE(review): max is 0 when T's largest value is below 64 (e.g. bool), making z % max undefined
    return z % max;
 }

--- a/src/include/migraphx/literal.hpp
+++ b/src/include/migraphx/literal.hpp
@@ -79,6 +79,7 @@ struct literal : raw_data<literal>
    template <class Iterator>
    void fill(Iterator start, Iterator end)
    {
+        assert(std::distance(start, end) == m_shape.elements());
        if(m_shape.standard())
        {
            m_shape.visit_type([&](auto as) { std::copy(start, end, as.from(buffer.get())); });

--- a/src/include/migraphx/op/reduce_sum.hpp
+++ b/src/include/migraphx/op/reduce_sum.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_SUM_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SUM_HPP
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <vector>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Reference operator that sums a tensor over the dimensions listed in `axes`.
+// Each reduced dimension is kept in the output with length 1.
+struct reduce_sum
+{
+    // Dimensions to sum over, as indices into the input's lens().
+    std::vector<std::size_t> axes;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axes, "axes"));
+    }
+    std::string name() const { return "reduce_sum"; }
+    // Returns the input's type and lengths with every entry of `axes` set to 1.
+    // Requires exactly one input shape. Throws std::out_of_range when an axis
+    // is not a valid dimension of that input.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+        auto s    = inputs.at(0);
+        auto lens = s.lens();
+        for(auto axis : axes)
+        {
+            // at() rejects an invalid axis instead of writing past the end of lens
+            lens.at(axis) = 1;
+        }
+        return {s.type(), lens};
+    }
+    // Accumulates every element of args[0] into the output element whose index
+    // equals the input index with all reduced axes set to 0.
+    // NOTE(review): the += below relies on `result` starting out zeroed --
+    // confirm that constructing an argument from a shape zero-initializes it.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            shape_for_each(input.get_shape(), [&](auto&& in_idx) {
+                auto out_idx = in_idx;
+                for(auto axis : axes)
+                    out_idx[axis] = 0;
+                output(out_idx.begin(), out_idx.end()) += input(in_idx.begin(), in_idx.end());
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -44,6 +44,7 @@
 #include <migraphx/op/outline.hpp>
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
+#include <migraphx/op/reduce_sum.hpp>
 #include <migraphx/op/relu.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/rnn.hpp>

--- a/src/pass_manager.cpp
+++ b/src/pass_manager.cpp
@@ -2,7 +2,6 @@
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
 #include <migraphx/target.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/ranges.hpp>

--- a/src/rewrite_rnn.cpp
+++ b/src/rewrite_rnn.cpp
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/add.hpp>
+#include <migraphx/op/broadcast.hpp>
+#include <migraphx/op/concat.hpp>
+#include <migraphx/op/dot.hpp>
+#include <migraphx/op/gru.hpp>
+#include <migraphx/op/lstm.hpp>
+#include <migraphx/op/mul.hpp>
+#include <migraphx/op/rnn.hpp>
+#include <migraphx/op/rnn_last_output.hpp>
+#include <migraphx/op/slice.hpp>
+#include <migraphx/op/squeeze.hpp>
+#include <migraphx/op/sub.hpp>
+#include <migraphx/op/transpose.hpp>
+#include <migraphx/op/unsqueeze.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/common.hpp>

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -2,7 +2,17 @@
 #include <migraphx/cpu/lowering.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/dfor.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/batch_norm.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/dot.hpp>
+#include <migraphx/op/elu.hpp>
+#include <migraphx/op/im2col.hpp>
+#include <migraphx/op/leaky_relu.hpp>
+#include <migraphx/op/logsoftmax.hpp>
+#include <migraphx/op/lrn.hpp>
+#include <migraphx/op/pad.hpp>
+#include <migraphx/op/pooling.hpp>
+#include <migraphx/op/softmax.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(migraphx_device
    device/gather.cpp
    device/sub.cpp
    device/clip.cpp
+    device/reduce_sum.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
@@ -74,6 +75,7 @@ add_library(migraphx_gpu
    schedule_model.cpp
    adjust_allocation.cpp
    clip.cpp
+    reduce_sum.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
@@ -50,6 +50,14 @@ struct hip_array
            result[i] = x[i] * y[i];
        return result;
    }
+    // Element-wise sum of two arrays of extent N.
+    friend MIGRAPHX_DEVICE_CONSTEXPR hip_array operator+(const hip_array& x, const hip_array& y)
+    {
+        hip_array total{};
+        std::size_t pos = 0;
+        while(pos < N)
+        {
+            total[pos] = x[pos] + y[pos];
+            ++pos;
+        }
+        return total;
+    }
 };
 } // namespace device

--- a/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
@@ -11,9 +11,33 @@ namespace device {
 struct index
 {
-    std::size_t global;
+    std::size_t global = 0; // all three members are now zero unless the launcher sets them
-    std::size_t local;
+    std::size_t local  = 0;
-    std::size_t group;
+    std::size_t group  = 0;
+    __device__ std::size_t nglobal() const { return blockDim.x * gridDim.x; } // NOLINT
+    __device__ std::size_t nlocal() const { return blockDim.x; } // NOLINT
+    template <class F>
+    __device__ void global_stride(std::size_t n, F f) const // calls f(i) for i = global, global + nglobal(), ... while i < n
+    {
+        const auto stride = nglobal();
+        for(std::size_t i = global; i < n; i += stride)
+        {
+            f(i);
+        }
+    }
+    template <class F>
+    __device__ void local_stride(std::size_t n, F f) const // calls f(i) for i = local, local + nlocal(), ... while i < n
+    {
+        const auto stride = nlocal();
+        for(std::size_t i = local; i < n; i += stride)
+        {
+            f(i);
+        }
+    }
 };
 template <class F>
@@ -35,18 +59,26 @@ inline auto launch(hipStream_t stream, std::size_t global, std::size_t local)
    };
 }
+template <class F>
+__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index idx) -> decltype(f(i, idx)) // overload viable only when f(i, idx) is well-formed
+{
+    return f(i, idx);
+}
+template <class F>
+__host__ __device__ auto gs_invoke(F&& f, std::size_t i, index) -> decltype(f(i)) // overload viable only when f(i) is well-formed; the index argument is ignored
+{
+    return f(i);
+}
 inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 1024)
 {
-    std::size_t groups  = 1 + n / local;
+    std::size_t groups  = (n + local - 1) / local; // n / local rounded up; NOTE(review): 0 when n == 0, unlike the old 1 + n / local -- confirm launch accepts nglobal == 0
    std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
    return [=](auto f) {
-        launch(stream, nglobal, local)([=](auto idx) {
+        launch(stream, nglobal, local)( // f is called through gs_invoke, so it may take (i) or (i, idx)
-            for(size_t i = idx.global; i < n; i += nglobal)
+            [=](auto idx) { idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); });
-            {
-                f(i);
-            }
-        });
    };
 }

--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_HPP
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/visit.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Addition functor; also used as the tag selecting the float overload of dpp_reduce.
+struct sum
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
+    {
+        auto total = x + y;
+        return total;
+    }
+};
+struct id // identity functor: returns its argument unchanged
+{
+    template <class T>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
+    {
+        return x;
+    }
+};
+struct max // binary functor: yields x when x > y, otherwise y
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
+    {
+        return x > y ? x : y; // result type is the conditional operator's common type of T and U
+    }
+};
+struct min // binary functor: yields x when x < y, otherwise y
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
+    {
+        return x < y ? x : y; // result type is the conditional operator's common type of T and U
+    }
+};
+struct lowest // placeholder that converts implicitly to the lowest value of the target type
+{
+    template <class T>
+    operator T() const // NOTE(review): no __host__/__device__ annotation, yet device lambdas in this change convert from it -- confirm the toolchain accepts that
+    {
+        return device_cast(std::numeric_limits<host_type<T>>::lowest()); // looked up on the host type, then passed through device_cast
+    }
+};
+struct highest // placeholder that converts implicitly to the largest value of the target type
+{
+    template <class T>
+    operator T() const // NOTE(review): no __host__/__device__ annotation -- confirm before using it inside device code
+    {
+        return device_cast(std::numeric_limits<host_type<T>>::max()); // looked up on the host type, then passed through device_cast
+    }
+};
+#ifdef MIGRAPHX_NO_DPP
+template <std::size_t N, class Op, class T, class F>
+__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f) // MIGRAPHX_NO_DPP variant: combines per-thread partials through an N-slot buffer
+{
+    using type = decltype(f(idx.local));
+    MIGRAPHX_DEVICE_SHARED type buffer[N]; // indexed by idx.local, so N has to cover the block size
+    type x = init;
+    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); }); // per-thread partial over i = local, local + nlocal(), ...
+    buffer[idx.local] = x;
+    __syncthreads();
+    for(std::size_t s = 1; s < idx.nlocal(); s *= 2) // pairwise combine; the distance s doubles on every pass
+    {
+        const std::size_t index = 2 * s * idx.local;
+        if(index + s < idx.nlocal())
+        {
+            buffer[index] = op(buffer[index], buffer[index + s]);
+        }
+        __syncthreads();
+    }
+    return buffer[0]; // NOTE(review): no barrier follows this read -- confirm a later call of the same instantiation cannot overwrite buffer[0] too early
+}
+#else
+constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; } // ORs x into 0x110; presumably the DPP control code for row_shr:x -- TODO confirm against the ISA manual
+// Maps a row-broadcast selector to its DPP control code: 15 -> 0x142, 31 -> 0x143.
+// Any other selector throws std::runtime_error.
+constexpr unsigned int dpp_row_bcast(unsigned int x)
+{
+    if(x == 15)
+        return 0x142;
+    if(x == 31)
+        return 0x143;
+    throw std::runtime_error("Unknown bcast");
+}
+template <unsigned int DppCtrl,
+          unsigned int RowMask  = 0xf,
+          unsigned int BankMask = 0xf,
+          bool BoundCtrl        = false,
+          class T>
+__device__ T dpp_mov(T& x) // applies __llvm_amdgcn_move_dpp to every 32-bit word of x and returns the reassembled value
+{
+    static const std::size_t n = sizeof(T) < 4 ? 1 : sizeof(T) / 4; // number of 32-bit words; types under 4 bytes use one
+    union type // lets the same storage be accessed as T or as n 32-bit words
+    {
+        uint32_t reg[n];
+        T data;
+    };
+    type output{};
+    type input{};
+    // cppcheck-suppress unreadVariable
+    input.data = x;
+    for(std::size_t i = 0; i < n; i++)
+    {
+        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
+    }
+    return output.data;
+}
+template <class T, class Op>
+__device__ void dpp_reduce(T& in, Op op) // folds `in` with op over six dpp_mov steps: row_shr 1, 2, 4, 8, then row_bcast 15 and 31
+{
+    T out;
+    out = dpp_mov<dpp_row_shr(1)>(in);
+    in  = op(in, out);
+    out = dpp_mov<dpp_row_shr(2)>(in);
+    in  = op(in, out);
+    out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in); // bank mask 0xe
+    in  = op(in, out);
+    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in); // bank mask 0xc
+    in  = op(in, out);
+    out = dpp_mov<dpp_row_bcast(15), 0xa>(in); // row mask 0xa
+    in  = op(in, out);
+    out = dpp_mov<dpp_row_bcast(31), 0xc>(in); // row mask 0xc
+    in  = op(in, out);
+}
+__device__ inline void dpp_reduce(float& x, sum) // float/sum overload: the same six shift and broadcast steps, written as v_add_f32 DPP instructions
+{
+#ifdef MIGRAPHX_USE_CLANG_TIDY
+    (void)x; // clang-tidy build: the asm is compiled out and x is left unchanged
+#else
+    __asm__ volatile("s_nop 4\n"
+                     "v_add_f32 %0 %0 %0 row_shr:1\n"
+                     "s_nop 1\n"
+                     "v_add_f32 %0 %0 %0 row_shr:2\n"
+                     "s_nop 1\n"
+                     "v_add_f32 %0 %0 %0 row_shr:4 bank_mask:0xe\n"
+                     "s_nop 1\n"
+                     "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
+                     "s_nop 1\n"
+                     "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
+                     "s_nop 1\n"
+                     "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+                     "s_nop 1\n"
+                     : "=v"(x)
+                     : "0"(x));
+#endif
+}
+// Block-wide reduction built on DPP cross-lane operations.
+//   N    - capacity the shared buffer is sized from (N / 64 slots); the block
+//          size must not exceed it.
+//   idx  - index of the calling thread.
+//   op   - binary combiner; init must be neutral for it, because init is folded
+//          in once per thread and once more in the final pass.
+//   n    - number of items to reduce; f(i) yields item i.
+// Every thread of the block must call this (it contains __syncthreads()), and
+// every thread receives the combined result.
+template <std::size_t N, class Op, class T, class F>
+__device__ auto block_reduce(index idx, Op op, T init, std::size_t n, F f)
+{
+    using type = decltype(f(idx.local));
+    // one slot per group of 64 threads
+    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+    type x = init;
+    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
+    dpp_reduce(x, op);
+    const auto ldsidx = idx.local / 64;
+    // the last thread of each 64-thread group stores its x as the group's partial
+    if((idx.local % 64) == 63)
+    {
+        buffer[ldsidx] = x;
+    }
+    __syncthreads();
+    // Seed with init, not 0: 0 is not neutral for every op (max over all-negative
+    // data would wrongly yield 0), whereas init already seeds each thread's x.
+    type y = init;
+    for(std::size_t i = 0; i < idx.nlocal() / 64; i++)
+    {
+        y = op(y, buffer[i]);
+    }
+    return y;
+}
+#endif
+// Smallest value of 64 * 2^k that is not below min(n, max_block_size); never less than 64.
+constexpr std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
+{
+    const std::size_t limit = n < max_block_size ? n : max_block_size;
+    std::size_t threads     = 64;
+    while(threads < limit)
+    {
+        threads *= 2;
+    }
+    return threads;
+}
+template <class Op, class T, class Input, class Output>
+void reduce(hipStream_t stream, // reduces arg into result with op; read_input maps each loaded element, read_output maps each final value
+            const argument& result,
+            const argument& arg,
+            Op op,
+            T init,
+            Input read_input,
+            Output read_output)
+{
+    auto&& output_shape = result.get_shape();
+    auto&& input_shape  = arg.get_shape();
+    std::vector<std::size_t> reduce_lens; // per dimension: 1 where output and input lengths match, otherwise the input length
+    std::transform(output_shape.lens().begin(),
+                   output_shape.lens().end(),
+                   input_shape.lens().begin(),
+                   std::back_inserter(reduce_lens),
+                   [](auto x, auto y) -> std::size_t {
+                       if(x == y)
+                           return 1;
+                       else
+                           return y;
+                   });
+    shape reduce_slice{output_shape.type(), reduce_lens}; // index space of the elements folded into a single output element
+    hip_visit_all(result, arg, reduce_slice)([&](auto output, auto input, auto reduce_shape) {
+        auto nelements = result.get_shape().elements();
+        auto relements = reduce_slice.elements();
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(relements, max_block_size);
+        gs_launch(stream, nelements * block_size, block_size)([=](auto i, auto idx) __device__ {
+            const auto out_idx = i / block_size; // block_size consecutive i values map to the same output element
+            auto base_idx      = output.get_shape().multi(out_idx);
+            auto r = block_reduce<max_block_size>(idx, op, init, relements, [&](auto j) __device__ {
+                auto reduce_idx = reduce_shape.multi(j);
+                return read_input(input[reduce_idx + base_idx]); // position inside the slice plus the output position
+            });
+            if(idx.local == 0) // only the thread with local index 0 stores the result
+                output.data()[out_idx] = read_output(r);
+        });
+    });
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp
@@ -73,6 +73,22 @@ struct hip_shape
        }
        return result;
    }
+    MIGRAPHX_DEVICE_CONSTEXPR hip_index carry(hip_index result) const // walks dimensions last to first, moving any excess above lens[i] - 1 into the next dimension visited
+    {
+        std::ptrdiff_t rem = 0; // excess carried over from the previously visited dimension
+        for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
+        {
+            auto z = result[i] + rem;
+            rem    = z - std::ptrdiff_t(lens[i]) + 1; // amount by which z exceeds lens[i] - 1
+            if(rem > 0)
+                z -= rem; // leaves z at lens[i] - 1; NOTE(review): this clamps rather than wrapping to z % lens[i] -- confirm intended
+            else
+                rem = 0;
+            result[i] = z;
+        }
+        return result;
+    }
 };
 template <std::size_t N>

--- a/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
@@ -91,7 +91,7 @@ using device_type = typename detail::device_type<T>::type;
 template <class T>
 host_type<T> host_cast(T x)
 {
-    return reinterpret_cast<host_type<T>>(x);
+    return reinterpret_cast<const host_type<T>&>(x); // views x's storage as host_type<T> through a reference cast and returns a copy of it
 }
 template <class T>

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
-#include <migraphx/gpu/device/reduce_opers.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -14,81 +14,41 @@ namespace device {
 void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
    auto lens                  = result.get_shape().lens();
-    auto batch_item_num = lens[axis];
    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis]; // number of elements along the reduced axis
    batch_lens[axis]           = 1;
    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-        // use one block for items in one batch.
+        const std::size_t max_block_size = 256; // upper bound on block_size; also the N passed to block_reduce
-        const std::size_t max_block_size = 1024;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
-        std::size_t block_size           = 1;
+        gs_launch(stream,
-        while(block_size < max_block_size and block_size < batch_item_num)
+                  batch_shape.elements() * block_size,
-        {
+                  block_size)([=](auto i, auto idx) __device__ {
-            block_size *= 2;
+            auto data_idx = batch.multi(i / block_size); // block_size consecutive i values share one batch entry
-        }
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest(); // seed for the max reduction
-        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
-            std::size_t thr_idx = idx.local;
+            auto batch_max = block_reduce<max_block_size>(
-            std::size_t blk_idx = idx.group;
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
-            using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+                    data_idx[axis] = j;
+                    return input[data_idx];
-            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+                });
-            auto batch_idx = batch.multi(blk_idx);
-            auto data_idx  = batch_idx;
-            // load data to lds and compute the batch max
-            std::size_t remaining_item_num = batch_item_num;
-            std::size_t round_item_num =
-                (batch_item_num + block_size - 1) / block_size * block_size;
-            lds_data[max_block_size] = input[0];
-            for(std::size_t i = thr_idx; i < round_item_num; i += block_size)
-            {
-                if(i < batch_item_num)
-                {
-                    data_idx[axis]    = i;
-                    lds_data[thr_idx] = input[data_idx];
-                }
-                __syncthreads();
-                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
-                block_reduce<type, max_op<type>>(
-                    lds_data, max_op<type>{}, block_size, thr_idx, item_num, max_block_size);
-                remaining_item_num -= block_size;
-            }
-            auto batch_max = lds_data[max_block_size];
-            __syncthreads();
-            lds_data[max_block_size] = 0;
-            remaining_item_num       = batch_item_num;
-            for(std::size_t i = thr_idx; i < round_item_num; i += block_size)
-            {
-                if(i < batch_item_num)
-                {
-                    data_idx[axis]    = i;
-                    lds_data[thr_idx] = input[data_idx] - batch_max;
-                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
-                }
-                __syncthreads();
-                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
-                block_reduce<type, sum_op<type>>(
-                    lds_data, sum_op<type>{}, block_size, thr_idx, item_num, max_block_size);
-                remaining_item_num -= block_size;
+            auto batch_sum =
-            }
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max; // subtract the batch max before exponentiating
+                    return ::exp(to_hip_type(val));
+                });
-            auto log_batch_sum = ::log(to_hip_type(lds_data[max_block_size])) + batch_max;
+            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max; // log(sum(exp(x - max))) + max
-            for(std::size_t i = thr_idx; i < batch_item_num; i += block_size)
+            idx.local_stride(batch_item_num, [&](auto j) {
-            {
+                data_idx[axis]   = j;
-                data_idx[axis]   = i;
                output[data_idx] = input[data_idx] - log_batch_sum;
-            }
+            });
        });
    });
 }

--- a/src/targets/gpu/device/reduce_sum.cpp
+++ b/src/targets/gpu/device/reduce_sum.cpp
+#include <migraphx/gpu/device/reduce_sum.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Sums `arg` into `result` by running the generic device reduce with addition,
+// a zero seed, and identity transforms on both the inputs and the outputs.
+void reduce_sum(hipStream_t stream, const argument& result, const argument& arg)
+{
+    const sum add_op{};
+    const id passthrough{};
+    reduce(stream, result, arg, add_op, 0, passthrough, passthrough);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/softmax.cpp
+++ b/src/targets/gpu/device/softmax.cpp
@@ -2,7 +2,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/gpu/device/softmax.hpp>
-#include <migraphx/gpu/device/reduce_opers.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -22,73 +22,33 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
-        // use one block for items in one batch.
+        const std::size_t max_block_size = 256;
-        const std::size_t max_block_size = 1024;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
-        std::size_t block_size           = 1;
+        gs_launch(stream,
-        while(block_size < max_block_size and block_size < batch_item_num)
+                  batch_shape.elements() * block_size,
-        {
+                  block_size)([=](auto i, auto idx) __device__ {
-            block_size *= 2;
+            auto data_idx = batch.multi(i / block_size);
-        }
+            using type    = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init     = lowest();
-        launch(stream, batch_shape.elements() * block_size, block_size)([=](auto idx) __device__ {
-            std::size_t thr_idx = idx.local;
+            auto batch_max = block_reduce<max_block_size>(
-            std::size_t blk_idx = idx.group;
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
-            using type = device_type<std::remove_cv_t<typename decltype(output)::value_type>>;
+                    data_idx[axis] = j;
+                    return input[data_idx];
-            MIGRAPHX_DEVICE_SHARED type lds_data[max_block_size + 1];
+                });
-            auto batch_idx = batch.multi(blk_idx);
-            auto data_idx  = batch_idx;
-            // load data to lds and compute the batch max
-            std::size_t remaining_item_num = batch_item_num;
-            std::size_t round_item_num =
-                (batch_item_num + block_size - 1) / block_size * block_size;
-            lds_data[max_block_size] = input[0];
-            for(std::size_t i = thr_idx; i < round_item_num; i += block_size)
-            {
-                if(i < batch_item_num)
-                {
-                    data_idx[axis]    = i;
-                    lds_data[thr_idx] = input[data_idx];
-                }
-                __syncthreads();
-                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
-                block_reduce<type, max_op<type>>(
-                    lds_data, max_op<type>{}, block_size, thr_idx, item_num, max_block_size);
-                remaining_item_num -= block_size;
-            }
-            auto batch_max = lds_data[max_block_size];
-            __syncthreads();
-            lds_data[max_block_size] = 0;
-            remaining_item_num       = batch_item_num;
-            for(std::size_t i = thr_idx; i < round_item_num; i += block_size)
-            {
-                if(i < batch_item_num)
-                {
-                    data_idx[axis]    = i;
-                    lds_data[thr_idx] = input[data_idx] - batch_max;
-                    lds_data[thr_idx] = ::exp(to_hip_type(lds_data[thr_idx]));
-                }
-                __syncthreads();
-                auto item_num = (remaining_item_num > block_size) ? block_size : remaining_item_num;
-                block_reduce<type, sum_op<type>>(
-                    lds_data, sum_op<type>{}, block_size, thr_idx, item_num, max_block_size);
-                remaining_item_num -= block_size;
+            auto batch_sum =
-            }
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
-            auto batch_sum = lds_data[max_block_size];
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
-            for(std::size_t i = thr_idx; i < batch_item_num; i += block_size)
+            idx.local_stride(batch_item_num, [&](auto j) {
-            {
+                data_idx[axis]   = j;
-                data_idx[axis]   = i;
                auto val         = input[data_idx] - batch_max;
                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
-            }
+            });
        });
    });
 }

--- a/src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/reduce_sum.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_REDUCE_SUM_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+void reduce_sum(hipStream_t stream, const argument& result, const argument& arg); // sums arg into result using the given stream
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/reduce_sum.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP
+#define MIGRAPHX_GUARD_RTGLIB_REDUCE_SUM_HPP
+#include <migraphx/shape.hpp>
+#include <migraphx/op/reduce_sum.hpp>
+#include <migraphx/reflect.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+struct hip_reduce_sum // GPU wrapper around op::reduce_sum
+{
+    op::reduce_sum op; // wrapped reference operator; reflection is forwarded to it
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+    std::string name() const { return "gpu::reduce_sum"; }
+    shape compute_shape(std::vector<shape> inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1; // the result aliases the last argument
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -49,6 +49,7 @@
 #include <migraphx/gpu/lrn.hpp>
 #include <migraphx/gpu/convert.hpp>
 #include <migraphx/gpu/clip.hpp>
+#include <migraphx/gpu/reduce_sum.hpp>
 #include <utility>
 #include <functional>
 #include <algorithm>
@@ -109,6 +110,7 @@ struct miopen_apply
        add_extend_op<hip_pad, op::pad>("pad");
        add_extend_op<hip_convert, op::convert>("convert");
        add_extend_op<hip_clip, op::clip>("clip");
+        add_extend_op<hip_reduce_sum, op::reduce_sum>("reduce_sum");
        add_lrn_op();
        add_convolution_op();

--- a/src/targets/gpu/reduce_sum.cpp
+++ b/src/targets/gpu/reduce_sum.cpp
+#include <migraphx/gpu/reduce_sum.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/reduce_sum.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Shape of the reduction result. `inputs` holds the shape of the tensor to
+// reduce followed by the shape of the output buffer; the latter is dropped
+// before the wrapped operator computes the result shape.
+// Throws (via check_shapes) unless exactly two shapes are supplied.
+shape hip_reduce_sum::compute_shape(std::vector<shape> inputs) const
+{
+    // Validate the count first: pop_back() on an empty vector is undefined behaviour.
+    check_shapes{inputs, *this}.has(2);
+    inputs.pop_back();
+    return op.compute_shape(inputs);
+}
+// Runs the device reduction from the first argument into the last argument
+// (the output buffer) on the context's stream, and returns that buffer.
+argument
+hip_reduce_sum::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    const argument& input  = args.front();
+    const argument& output = args.back();
+    device::reduce_sum(ctx.get_stream().get(), output, input);
+    return output;
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx