Commit 20b1d690 authored by Paul

Merge branch 'develop' into tests

parents 17aaaa1e ba729cfc
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SHAPE_HPP
#include <migraphx/gpu/device/array.hpp>
#include <migraphx/shape.hpp>
#include <algorithm>
#include <cassert>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <std::size_t N>
struct hip_shape
{
    using hip_index = hip_array<std::size_t, N>;

    hip_array<std::size_t, N> lens    = {};
    hip_array<std::size_t, N> strides = {};
    bool standard                     = false;

    __device__ __host__ hip_shape() = default;

    hip_shape(const shape& s) : standard(s.standard())
    {
        assert(s.lens().size() == N);
        assert(s.strides().size() == N);
        std::copy(s.lens().begin(), s.lens().end(), lens.begin());
        std::copy(s.strides().begin(), s.strides().end(), strides.begin());
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t elements() const { return lens.product(); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(hip_index x) const { return x.dot(strides); }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::initializer_list<std::size_t> x) const
    {
        std::size_t idx = 0;
        for(std::size_t i = 0; i < x.size(); i++)
            idx += *(x.begin() + i) * strides[i];
        return idx;
    }

    MIGRAPHX_DEVICE_CONSTEXPR std::size_t index(std::size_t i) const
    {
        if(this->standard)
            return i;
        else
        {
            const std::size_t rank = this->lens.size();
            std::size_t s          = 1;
            std::size_t result     = 0;
            for(std::size_t j = 0; j < this->lens.size(); j++)
            {
                const std::size_t k      = rank - j - 1;
                const std::size_t stride = this->strides[k];
                const std::size_t len    = this->lens[k];
                const std::size_t slen   = s * len;
                const std::size_t idx    = (i % slen) / s;
                result += stride * idx;
                s = slen;
            }
            return result;
        }
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index multi(std::size_t idx) const
    {
        hip_index result;
        std::size_t tidx = idx;
        for(std::size_t is = 0; is < result.size(); is++)
        {
            result[is] = tidx / strides[is];
            tidx       = tidx % strides[is];
        }
        return result;
    }

    MIGRAPHX_DEVICE_CONSTEXPR hip_index carry(hip_index result) const
    {
        std::ptrdiff_t rem = 0;
        for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
        {
            auto z = result[i] + rem;
            rem    = z - std::ptrdiff_t(lens[i]) + 1;
            if(rem > 0)
                z -= rem;
            else
                rem = 0;
            result[i] = z;
        }
        return result;
    }
};

template <std::size_t N>
hip_shape<N> make_hip_shape(const shape& x)
{
    return x;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
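The interesting case above is index(std::size_t) for non-standard layouts: a flat element id is decomposed from the innermost dimension outwards and re-dotted with the strides. A minimal host-side sketch in plain C++ (not part of the header) of that loop, for a 2x3 tensor stored transposed (strides {1, 2}):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<std::size_t> lens    = {2, 3};
    std::vector<std::size_t> strides = {1, 2}; // column-major, i.e. transposed

    for(std::size_t i = 0; i < 6; i++)
    {
        std::size_t s = 1, result = 0;
        for(std::size_t j = 0; j < lens.size(); j++)
        {
            const std::size_t k    = lens.size() - j - 1;
            const std::size_t slen = s * lens[k];
            result += strides[k] * ((i % slen) / s);
            s = slen;
        }
        std::cout << i << " -> " << result << '\n'; // prints 0 2 4 1 3 5
    }
}

The printed mapping 0 2 4 1 3 5 is exactly the permutation a transposed copy would apply; for a standard (packed row-major) shape the branch is skipped and the id is returned unchanged.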
 #ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_HPP
 #define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_HPP
-#include <hip/hip_runtime.h>
+#include <migraphx/gpu/device/visit.hpp>
 #include <migraphx/functional.hpp>
 #include <migraphx/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-template <class F>
-void visit_tensor_size(std::size_t n, F f)
-{
-    switch(n)
-    {
-    case 1: f(std::integral_constant<std::size_t, 1>{}); break;
-    case 2: f(std::integral_constant<std::size_t, 2>{}); break;
-    case 3: f(std::integral_constant<std::size_t, 3>{}); break;
-    case 4: f(std::integral_constant<std::size_t, 4>{}); break;
-    case 5: f(std::integral_constant<std::size_t, 5>{}); break;
-    default: throw std::runtime_error("Unknown tensor size");
-    }
-}
-
-template <size_t NDim>
-struct hip_index
-{
-    size_t d[NDim];
-    __device__ __host__ size_t& operator[](size_t i) { return d[i]; }
-    __device__ __host__ size_t operator[](size_t i) const { return d[i]; }
-};
+template <std::size_t NDim>
+using hip_tensor_index = hip_array<std::size_t, NDim>;

-template <size_t NDim>
+template <std::size_t NDim>
 struct hip_tensor_descriptor
 {
     __device__ __host__ hip_tensor_descriptor() = default;
@@ -63,26 +22,27 @@ struct hip_tensor_descriptor
         std::copy(s.strides().begin(), s.strides().end(), strides);
     }

-    __device__ __host__ hip_index<NDim> multi(size_t idx) const
+    __device__ __host__ hip_tensor_index<NDim> multi(std::size_t idx) const
     {
-        hip_index<NDim> result{};
-        size_t tidx = idx;
-        for(size_t is = 0; is < NDim; is++)
+        hip_tensor_index<NDim> result{};
+        std::size_t tidx = idx;
+        for(std::size_t is = 0; is < NDim; is++)
         {
             result[is] = tidx / strides[is];
             tidx       = tidx % strides[is];
         }
         return result;
     }

-    __device__ __host__ size_t linear(hip_index<NDim> s) const
+    __device__ __host__ std::size_t linear(hip_tensor_index<NDim> s) const
     {
-        size_t idx = 0;
-        for(size_t i = 0; i < NDim; i++)
+        std::size_t idx = 0;
+        for(std::size_t i = 0; i < NDim; i++)
             idx += s[i] * strides[i];
         return idx;
     }

-    size_t lens[NDim]    = {};
-    size_t strides[NDim] = {};
+    std::size_t lens[NDim]    = {};
+    std::size_t strides[NDim] = {};
 };

 } // namespace device
...
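For hip_tensor_descriptor, multi() and linear() are inverses only when the strides describe a packed row-major layout, since multi() recovers coordinates by dividing by strides. A small host-side round-trip check in plain C++ (stand-in arrays, not the library's types):

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t strides[2] = {3, 1}; // packed 2x3 descriptor
    for(std::size_t i = 0; i < 6; i++)
    {
        std::size_t idx[2], t = i;
        for(std::size_t d = 0; d < 2; d++)
        {
            idx[d] = t / strides[d]; // multi(): peel one dimension per division
            t      = t % strides[d];
        }
        // linear() is the inverse dot product; each id round-trips to itself
        std::cout << i << " -> (" << idx[0] << "," << idx[1] << ") -> "
                  << idx[0] * strides[0] + idx[1] * strides[1] << '\n';
    }
}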
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TENSOR_VIEW_HPP
#include <migraphx/gpu/device/shape.hpp>
#include <migraphx/tensor_view.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, std::size_t N>
struct hip_tensor_view
{
    using value_type = T;
    using hip_index  = typename hip_shape<N>::hip_index;

    __device__ __host__ hip_tensor_view() = default;
    __host__ hip_tensor_view(tensor_view<T> x) : d(x.data()), s(x.get_shape()) {}
    __host__ hip_tensor_view(T* x, const shape& ss) : d(x), s(ss) {}

    MIGRAPHX_DEVICE_CONSTEXPR const hip_shape<N>& get_shape() const { return s; }
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return s.elements(); }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* data() const { return d; }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR value_type& operator[](U i) const
    {
        return d[s.index(i)];
    }

    MIGRAPHX_DEVICE_CONSTEXPR value_type* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR value_type* end() const { return d + size(); }

    private:
    value_type* d = nullptr;
    hip_shape<N> s{};
};

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(const shape& s, T* x)
{
    return {x, s};
}

template <std::size_t N, class T>
hip_tensor_view<T, N> make_hip_view(tensor_view<T> x)
{
    return {x};
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
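hip_tensor_view forwards whatever is passed to operator[] on to the shape's index() overloads, so the same bracket accepts a flat element id, a hip_index, or an initializer list. A self-contained host sketch of that dispatch using stand-in toy types (not the library's):

#include <array>
#include <cstddef>
#include <iostream>

struct toy_shape
{
    std::array<std::size_t, 2> strides{3, 1};
    std::size_t index(std::size_t i) const { return i; } // packed layout
    std::size_t index(std::array<std::size_t, 2> x) const
    {
        return x[0] * strides[0] + x[1] * strides[1];
    }
};

template <class T>
struct toy_view
{
    T* d;
    toy_shape s;
    template <class U>
    T& operator[](U i) const { return d[s.index(i)]; } // same forwarding trick
};

int main()
{
    int data[6] = {10, 11, 12, 13, 14, 15};
    toy_view<int> v{data, {}};
    // flat id and multi-index reach the same element
    std::cout << v[4] << " == " << v[std::array<std::size_t, 2>{1, 1}] << '\n'; // 14 == 14
}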
@@ -8,14 +8,45 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
+#include <hip/hip_runtime.h>
 #include <migraphx/half.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/tensor_view.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

+#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
+
+template <class T, std::size_t N>
+using vec = T __attribute__((ext_vector_type(N)));
+
+template <std::size_t N, class T>
+__device__ __host__ T* as_pointer(vec<T, N>* x)
+{
+    return reinterpret_cast<T*>(x);
+}
+
+template <std::size_t N, class T>
+__device__ __host__ vec<T, N>* as_vec(T* x)
+{
+    return reinterpret_cast<vec<T, N>*>(x);
+}
+
+template <std::size_t N, class T>
+tensor_view<vec<T, N>> as_vec(tensor_view<T> x)
+{
+    return {x.get_shape(), as_vec<N>(x.data())};
+}
+
+template <std::size_t N, class... Ts>
+auto pack_vec(Ts... xs)
+{
+    return [=](auto f, std::size_t n) { return f(as_vec<N>(xs)[n]...); };
+}
+
 using gpu_half = __fp16;

 namespace detail {
@@ -25,6 +56,12 @@ struct device_type
     using type = T;
 };

+template <class T, std::size_t N>
+struct device_type<vec<T, N>>
+{
+    using type = vec<typename device_type<T>::type, N>;
+};
+
 template <>
 struct device_type<half>
 {
@@ -38,7 +75,7 @@ struct host_type
 };

 template <>
-struct device_type<gpu_half>
+struct host_type<gpu_half>
 {
     using type = half;
 };
@@ -54,7 +91,7 @@ using device_type = typename detail::device_type<T>::type;
 template <class T>
 host_type<T> host_cast(T x)
 {
-    return reinterpret_cast<host_type<T>>(x);
+    return reinterpret_cast<const host_type<T>&>(x);
 }

 template <class T>
@@ -64,9 +101,9 @@ host_type<T>* host_cast(T* x)
 }

 template <class T>
-device_type<T> device_cast(T x)
+device_type<T> device_cast(const T& x)
 {
-    return reinterpret_cast<device_type<T>>(x);
+    return reinterpret_cast<const device_type<T>&>(x);
 }

 template <class T>
@@ -76,13 +113,19 @@ device_type<T>* device_cast(T* x)
 }

 template <class T>
+tensor_view<device_type<T>> device_cast(tensor_view<T> x)
+{
+    return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
+}
+
+template <class T>
-T to_hip_type(T x)
+__device__ __host__ T to_hip_type(T x)
 {
     return x;
 }

 // HIP doesn't support __fp16
-inline float to_hip_type(gpu_half x) { return x; }
+inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

 } // namespace device
 } // namespace gpu
...
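as_vec() reinterprets a scalar buffer as Clang ext_vector_type lanes so a kernel can move several elements per instruction. A minimal sketch of the idea, assuming a Clang-compatible compiler (hipcc qualifies); the reinterpret_cast mirrors what as_vec does and carries the same alignment and aliasing caveats:

#include <cstddef>
#include <iostream>

template <class T, std::size_t N>
using vec = T __attribute__((ext_vector_type(N))); // Clang extension

int main()
{
    alignas(16) float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    auto* v = reinterpret_cast<vec<float, 4>*>(data); // view as two float4 lanes
    v[1] += 10.f;                                     // elementwise on the second float4
    std::cout << data[4] << ' ' << data[7] << '\n';   // 14 17
}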
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VECTOR_HPP
#include <migraphx/gpu/device/types.hpp>
#include <algorithm>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class T, std::size_t N>
struct hip_vector
{
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector() = default;
    MIGRAPHX_DEVICE_CONSTEXPR hip_vector(std::size_t s) : len(s) {}

    template <class Iterator>
    __device__ __host__ hip_vector(Iterator start, Iterator last)
    {
        auto it = std::copy(start, last, d);
        len     = std::distance(d, it);
    }

    __device__ __host__ hip_vector(std::initializer_list<T> x)
    {
        std::copy(x.begin(), x.end(), d);
        len = x.size();
    }

    MIGRAPHX_DEVICE_CONSTEXPR T& operator[](std::size_t i) { return d[i]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& operator[](std::size_t i) const { return d[i]; }

    MIGRAPHX_DEVICE_CONSTEXPR T& front() { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& front() const { return d[0]; }
    MIGRAPHX_DEVICE_CONSTEXPR T& back() { return d[size() - 1]; }
    MIGRAPHX_DEVICE_CONSTEXPR const T& back() const { return d[size() - 1]; }

    MIGRAPHX_DEVICE_CONSTEXPR T* data() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* data() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR std::size_t size() const { return len; }

    MIGRAPHX_DEVICE_CONSTEXPR T* begin() { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR const T* begin() const { return d; }
    MIGRAPHX_DEVICE_CONSTEXPR T* end() { return d + size(); }
    MIGRAPHX_DEVICE_CONSTEXPR const T* end() const { return d + size(); }

    template <class U>
    MIGRAPHX_DEVICE_CONSTEXPR void push_back(U&& x)
    {
        d[len] = static_cast<U&&>(x);
        len++;
    }

    private:
    T d[N]          = {};
    std::size_t len = 0;
};

template <std::size_t N, class T>
hip_vector<T, N> to_hip_vector(const std::vector<T>& x)
{
    hip_vector<T, N> result(x.size());
    std::copy(x.begin(), x.end(), result.begin());
    return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
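hip_vector is a fixed-capacity array paired with a length, so its storage lives inline in the object and it can be captured by value into a device lambda; note that push_back performs no capacity check, so staying within N is the caller's contract. A host-side analogue of the idea (a stand-in type, not the library's):

#include <cstddef>
#include <iostream>

template <class T, std::size_t N>
struct inline_vector
{
    T d[N]          = {};
    std::size_t len = 0;
    void push_back(const T& x) { d[len++] = x; } // no bounds check, as in hip_vector
    const T* begin() const { return d; }
    const T* end() const { return d + len; }
};

int main()
{
    inline_vector<int, 4> v;
    v.push_back(1);
    v.push_back(2);
    for(int x : v)
        std::cout << x << ' '; // 1 2
}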
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_VISIT_HPP
#include <migraphx/gpu/device/tensor_view.hpp>
#include <migraphx/errors.hpp>
#include <algorithm>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class F>
void visit_tensor_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1:
    {
        f(std::integral_constant<std::size_t, 1>{});
        break;
    }
    case 2:
    {
        f(std::integral_constant<std::size_t, 2>{});
        break;
    }
    case 3:
    {
        f(std::integral_constant<std::size_t, 3>{});
        break;
    }
    case 4:
    {
        f(std::integral_constant<std::size_t, 4>{});
        break;
    }
    case 5:
    {
        f(std::integral_constant<std::size_t, 5>{});
        break;
    }
    default: throw std::runtime_error("Unknown tensor size");
    }
}

inline shape get_shape(const shape& x) { return x; }

template <class T>
auto get_shape(const T& x) -> decltype(x.get_shape())
{
    return x.get_shape();
}

template <class V, class F, class... Ts>
void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
    if(!std::all_of(
           types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
        MIGRAPHX_THROW("Types must be the same");
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(),
                      [&](auto ndim) { s.visit_type([&](auto as) { v(f(xs, ndim, as)...); }); });
}

template <class V, class F, class... Ts>
void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
{
    std::initializer_list<std::size_t> ranks = {get_shape(xs).lens().size()...};
    if(!std::all_of(
           ranks.begin(), ranks.end(), [&](std::size_t r) { return r == s.lens().size(); }))
        MIGRAPHX_THROW("Ranks must be the same");
    visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
}

template <class F>
struct hip_convert
{
    F f;

    template <class RawData, class N, class As>
    auto operator()(RawData x, N ndim, As as) const
        -> decltype(make_hip_view<ndim>(x.get_shape(), f(as.from(x.data()))))
    {
        return make_hip_view<ndim>(x.get_shape(), f(as.from(x.data())));
    }

    template <class N, class As>
    auto operator()(const shape& s, N ndim, As) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert<F> make_hip_convert(F f)
{
    return {f};
}

template <class F>
struct hip_convert_view
{
    F f;

    template <class T, class N>
    auto operator()(tensor_view<T> x, N ndim) const
    {
        return make_hip_view<ndim>(f(x));
    }

    template <class N>
    auto operator()(const shape& s, N ndim) const
    {
        return make_hip_shape<ndim>(s);
    }
};

template <class F>
hip_convert_view<F> make_hip_convert_view(F f)
{
    return {f};
}

template <class T, class... Ts>
auto hip_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(
            get_shape(x), make_hip_convert([](auto* p) { return device_cast(p); }), f, x, xs...);
    };
}

template <std::size_t N, class T, class... Ts>
auto hip_vec_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_all_impl(get_shape(x),
                           make_hip_convert([](auto* p) { return as_vec<N>(device_cast(p)); }),
                           f,
                           x,
                           xs...);
    };
}

template <class T, class... Ts>
auto hip_pointer_visit_all(T&& x, Ts&&... xs)
{
    return [&](auto f) { visit_all(x, xs...)([&](auto... vs) { f(device_cast(vs.data())...); }); };
}

template <class T, class... Ts>
auto hip_visit_views(T&& x, Ts&&... xs)
{
    return [&](auto f) {
        hip_visit_views_impl(get_shape(x),
                             make_hip_convert_view([](auto v) { return device_cast(v); }),
                             f,
                             x,
                             xs...);
    };
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
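The core trick in visit_tensor_size is converting a runtime rank into a std::integral_constant, which the generic lambda can then use as a template argument (exactly how make_hip_shape<ndim>(s) is called above). A self-contained sketch of that dispatch in plain C++:

#include <array>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <type_traits>

template <class F>
void visit_size(std::size_t n, F f)
{
    switch(n)
    {
    case 1: f(std::integral_constant<std::size_t, 1>{}); break;
    case 2: f(std::integral_constant<std::size_t, 2>{}); break;
    default: throw std::runtime_error("Unknown tensor size");
    }
}

int main()
{
    visit_size(2, [](auto ndim) {
        // ndim converts to a constant expression, so it works as a template argument
        std::array<int, ndim> a{};
        std::cout << a.size() << '\n'; // 2
    });
}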
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto comp_shape    = arg.get_shape();
    auto out_lens      = comp_shape.lens();
    auto dim_0         = out_lens.size() - 2;
    auto dim_1         = out_lens.size() - 1;
    std::size_t lda    = comp_shape.strides()[dim_0];
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_m    = idx[dim_1];
                std::size_t i_k    = idx[dim_0];
                std::size_t offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_m + (i_k / nb) * lda) * nb + offset] =
                    in_ptr[i_m + i_k * lda + offset];
            });
        });
    });
}

void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto trans_shape = arg.get_shape();
    auto out_lens    = trans_shape.lens();
    auto dim_0       = trans_shape.lens().size() - 2;
    auto dim_1       = trans_shape.lens().size() - 1;
    std::size_t ldb  = trans_shape.strides()[dim_1];
    auto wrap_lens   = out_lens;
    std::swap(wrap_lens[dim_0], wrap_lens[dim_1]);
    shape comp_shape{trans_shape.type(), wrap_lens};
    std::size_t m_size = out_lens[dim_0] * out_lens[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = comp_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
            gs_launch(stream, nelements, 256)([=](auto ii) {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_n    = idx[dim_1];
                std::size_t i_k    = idx[dim_0];
                std::size_t offset = ii / m_size * m_size;
                out_ptr[i_k % nb + (i_n + (i_k / nb) * ldb) * nb + offset] =
                    in_ptr[i_n + i_k * ldb + offset];
            });
        });
    });
}
void sync_stream(hipStream_t stream) { hipStreamSynchronize(stream); }
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
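Both kernels apply the same index permutation: groups of nb = 4 consecutive k-values of one output column are packed into adjacent elements, the interleaved layout the int8 GEMM backend expects. A host-side sketch of the pack_a formula for a hypothetical panel (lda = 4, m = 4, k = 8, so the mapping is a bijection over the 32 elements):

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t nb = 4, lda = 4, k = 8, m = 4; // hypothetical panel sizes
    for(std::size_t i_k = 0; i_k < k; i_k++)
        for(std::size_t i_m = 0; i_m < m; i_m++)
            // in: column-stored element (i_m, i_k); out: nb k-values packed per i_m
            std::cout << "in " << i_m + i_k * lda << " -> out "
                      << i_k % nb + (i_m + (i_k / nb) * lda) * nb << '\n';
}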
 #include <migraphx/shape.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
 #include <migraphx/gpu/device/types.hpp>
@@ -11,57 +12,45 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-argument logsoftmax(hipStream_t stream,
-                    const migraphx::shape& output_shape,
-                    std::vector<migraphx::argument> args,
-                    int axis)
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
 {
-    auto lens              = output_shape.lens();
-    std::size_t batch_size = std::accumulate(
-        lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<std::size_t>());
-    std::size_t n_dims = std::accumulate(
-        lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
-    migraphx::shape comp_shape{output_shape.type(), {batch_size, n_dims}};
-
-    visit_all(args.back(), args.front())([&](auto output, auto input) {
-        const auto* input_ptr = device_cast(input.data());
-        auto* output_ptr      = device_cast(output.data());
-        // each thread is for one item in the batch
-        gs_launch(stream, batch_size)([=](auto i) {
-            std::size_t row_start = i * n_dims;
-            // get max
-            auto batch_max = input_ptr[row_start];
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                auto ind  = row_start + j;
-                batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[ind]));
-            }
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                auto ind        = row_start + j;
-                output_ptr[ind] = input_ptr[ind] - batch_max;
-            }
-            auto batch_sum = ::exp(to_hip_type(output_ptr[row_start]));
-            for(std::size_t j = 1; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                batch_sum += ::exp(to_hip_type(output_ptr[ind]));
-            }
-            batch_sum = ::log(to_hip_type(batch_sum));
-            for(std::size_t j = 0; j < n_dims; ++j)
-            {
-                auto ind = row_start + j;
-                output_ptr[ind] -= batch_sum;
-            }
+    auto lens                  = result.get_shape().lens();
+    auto batch_lens            = lens;
+    std::size_t batch_item_num = lens[axis];
+    batch_lens[axis]           = 1;
+    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
+
+    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
+        const std::size_t max_block_size = 256;
+        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
+        gs_launch(stream,
+                  batch_shape.elements() * block_size,
+                  block_size)([=](auto i, auto idx) __device__ {
+            auto data_idx = batch.multi(i / block_size);
+            using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
+            type init  = lowest();
+
+            auto batch_max = block_reduce<max_block_size>(
+                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    return input[data_idx];
+                });
+
+            auto batch_sum =
+                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
+                    data_idx[axis] = j;
+                    auto val       = input[data_idx] - batch_max;
+                    return ::exp(to_hip_type(val));
+                });
+            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
+
+            idx.local_stride(batch_item_num, [&](auto j) {
+                data_idx[axis]   = j;
+                output[data_idx] = input[data_idx] - log_batch_sum;
+            });
         });
     });
-    return args.back();
 }

 } // namespace device
...
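The rewritten kernel computes the standard numerically stable form of log-softmax (a well-known identity, stated here for reference):

\log\operatorname{softmax}(x)_i
  = x_i - \log\sum_j e^{x_j}
  = x_i - \Bigl(\max_k x_k + \log\sum_j e^{\,x_j - \max_k x_k}\Bigr)

which is exactly what log_batch_sum = ::log(batch_sum) + batch_max evaluates, with batch_sum being the block-reduced sum of exp(x_j - batch_max); subtracting the max first keeps exp() from overflowing.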
#include <migraphx/gpu/device/add_unary.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void mul_add(hipStream_t stream,
             const argument& result,
             const argument& arg1,
             const argument& arg2,
             const argument& arg3)
{
    nary(stream, result, arg1, arg2, arg3)([](auto x, auto a, auto b) { return a * x + b; });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
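mul_add is one instance of the generic nary() pointwise pattern: a single launcher takes any number of input arguments plus a lambda, so each elementwise op is a one-liner. A host-side sketch of the pattern with stand-in names (the real nary also handles broadcasting and device dispatch):

#include <cstddef>
#include <iostream>
#include <vector>

template <class F, class... Ts>
void pointwise(std::vector<float>& out, F f, const std::vector<Ts>&... in)
{
    // apply f elementwise across any number of equally sized inputs
    for(std::size_t i = 0; i < out.size(); i++)
        out[i] = f(in[i]...);
}

int main()
{
    std::vector<float> x{1, 2}, a{3, 4}, b{5, 6}, r(2);
    pointwise(r, [](float xi, float ai, float bi) { return ai * xi + bi; }, x, a, b);
    std::cout << r[0] << ' ' << r[1] << '\n'; // 8 14
}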
@@ -4,6 +4,7 @@
 #include <migraphx/gpu/device/pad.hpp>
 #include <migraphx/gpu/device/tensor.hpp>
 #include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/float_equal.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -14,24 +15,26 @@ argument
 pad(hipStream_t stream, argument result, argument arg1, float value, std::vector<std::int64_t> pads)
 {
     std::size_t nelements = arg1.get_shape().elements();
-    nary(stream, result)([=] { return value; });
-    visit_all(result, arg1)([&](auto output, auto input) {
-        visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
-            std::size_t offsets[ndim];
-            std::copy(pads.begin(), pads.begin() + ndim, offsets);
-            auto* outptr      = output.data();
-            const auto* inptr = input.data();
-            hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-            hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-            gs_launch(stream, nelements)([=](auto i) {
-                auto idx = desc_input.multi(i);
-                for(std::size_t j = 0; j < ndim; j++)
-                {
-                    idx[j] += offsets[j];
-                }
-                outptr[desc_output.linear(idx)] = inptr[i];
-            });
-        });
-    });
+    hip_visit_all(result, arg1)([&](auto output, auto input) {
+        using type      = typename decltype(output)::value_type;
+        using hip_index = typename decltype(output)::hip_index;
+        type device_val = value;
+        if(float_equal(value, std::numeric_limits<float>::lowest()))
+        {
+            device_val = device_cast(std::numeric_limits<type>::lowest());
+        }
+        gs_launch(stream,
+                  result.get_shape().elements())([=](auto i) { output.data()[i] = device_val; });
+
+        hip_index offsets;
+        std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
+        gs_launch(stream, nelements)([=](auto i) {
+            auto idx = input.get_shape().multi(i);
+            for(std::size_t j = 0; j < offsets.size(); j++)
+            {
+                idx[j] += offsets[j];
+            }
+            output[idx] = input.data()[i];
+        });
+    });
     return result;
...
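The new pad first fills the entire output with the (type-converted) pad value, then scatters every input element to its multi-index shifted by the leading pads. A 1-D host sketch of the second step (plain C++, not library code):

#include <cstddef>
#include <iostream>

int main()
{
    float in[2] = {1, 2}, out[5] = {9, 9, 9, 9, 9}; // 9 plays the pre-filled pad value
    const std::size_t offset = 1;                   // pads = {1, 2}: 1 before, 2 after
    for(std::size_t i = 0; i < 2; i++)
        out[i + offset] = in[i]; // shift each input index by the leading pad
    for(float v : out)
        std::cout << v << ' '; // 9 1 2 9 9
}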
#include <migraphx/gpu/device/pow.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void pow(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
    nary(stream, result, arg1, arg2)(
        [](auto b, auto e) { return ::pow(to_hip_type(b), to_hip_type(e)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/reduce_mean.hpp>
#include <migraphx/gpu/device/reduce.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void reduce_mean(hipStream_t stream, const argument& result, const argument& arg)
{
    std::size_t item_num = arg.get_shape().elements() / result.get_shape().elements();
    reduce(stream, result, arg, sum{}, 0, id{}, mean{item_num});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/reduce_sum.hpp>
#include <migraphx/gpu/device/reduce.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void reduce_sum(hipStream_t stream, const argument& result, const argument& arg)
{
    reduce(stream, result, arg, sum{}, 0, id{}, id{});
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/relu.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void relu(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return std::max<decltype(x)>(0, x); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/round.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void round(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::round(to_hip_type(x)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/rsqrt.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void rsqrt(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) __device__ { return ::rsqrt(to_hip_type(x)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/sigmoid.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sigmoid(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return 1.f / (1.f + ::exp(to_hip_type(-x))); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/sign.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sign(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return (x > 0 ? 1 : ((x < 0) ? -1 : 0)); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/softmax.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
{
    auto lens                  = result.get_shape().lens();
    auto batch_lens            = lens;
    std::size_t batch_item_num = lens[axis];
    batch_lens[axis]           = 1;
    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};

    hip_visit_all(result, arg, batch_shape)([&](auto output, auto input, auto batch) {
        const std::size_t max_block_size = 256;
        const std::size_t block_size     = compute_block_size(batch_item_num, max_block_size);
        gs_launch(stream,
                  batch_shape.elements() * block_size,
                  block_size)([=](auto i, auto idx) __device__ {
            auto data_idx = batch.multi(i / block_size);
            using type = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
            type init  = lowest();

            auto batch_max = block_reduce<max_block_size>(
                idx, max{}, init, batch_item_num, [&](auto j) __device__ {
                    data_idx[axis] = j;
                    return input[data_idx];
                });

            auto batch_sum =
                block_reduce<max_block_size>(idx, sum{}, 0, batch_item_num, [&](auto j) __device__ {
                    data_idx[axis] = j;
                    auto val       = input[data_idx] - batch_max;
                    return ::exp(to_hip_type(val));
                });

            idx.local_stride(batch_item_num, [&](auto j) {
                data_idx[axis]   = j;
                auto val         = input[data_idx] - batch_max;
                output[data_idx] = ::exp(to_hip_type(val)) / batch_sum;
            });
        });
    });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
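softmax shares logsoftmax's max-subtraction trick: dividing by the shifted sum is mathematically identical to the unshifted softmax but keeps exp() in range. A plain-C++ reference check of that invariance (not library code):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    std::vector<double> x{1000.0, 1001.0, 1002.0}; // naive exp() would overflow here
    double mx  = *std::max_element(x.begin(), x.end());
    double sum = 0;
    for(double v : x)
        sum += std::exp(v - mx); // shifted sum stays finite
    for(double v : x)
        std::cout << std::exp(v - mx) / sum << ' '; // ~0.090 0.245 0.665
}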
#include <migraphx/gpu/device/sqdiff.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void sqdiff(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
    nary(stream, result, arg1, arg2)([](auto x, auto y) { return (x - y) * (x - y); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx