Merge remote-tracking branch 'origin/jit-layernorm-merge' into bert-perf

dca000fe · turneram · 8309364c · e5a25712 · dca000fe · dca000fe
Commit dca000fe authored Aug 23, 2022 by turneram
4 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -33,49 +33,91 @@
 namespace migraphx {
 // NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op)                                    \
+#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op)                                             \
-    template <class U>                                                             \
+    template <class U>                                                                      \
-    constexpr array& operator op(const array<U, N>& x)                             \
+    constexpr array& operator op(const array<U, N>& x)                                      \
-    {                                                                              \
+    {                                                                                       \
-        for(index_int i = 0; i < N; i++)                                           \
+        array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; });                      \
-            d[i] op x[i];                                                          \
+        return *this;                                                                       \
-        return *this;                                                              \
+    }                                                                                       \
-    }                                                                              \
+    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                           \
-    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                  \
+    constexpr array& operator op(const U& x)                                                \
-    constexpr array& operator op(const U& x)                                       \
+    {                                                                                       \
-    {                                                                              \
+        array_for_each (*this)([&](auto& sy) { sy op x; });                                 \
-        for(index_int i = 0; i < N; i++)                                           \
+        return *this;                                                                       \
-            d[i] op x;                                                             \
+    }                                                                                       \
-        return *this;                                                              \
+    template <class U>                                                                      \
-    }                                                                              \
+    friend constexpr auto operator binary_op(const array& x, const array<U, N>& y)          \
-    template <class U>                                                             \
+    {                                                                                       \
-    friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
+        array<decltype(T {} binary_op U{}), N> z{};                                         \
-    {                                                                              \
+        array_for_each(z, x, y)([&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
-        array<decltype(T {} binary_op U{}), N> z{};                                \
+        return z;                                                                           \
-        for(index_int i = 0; i < N; i++)                                           \
+    }                                                                                       \
-            z[i] = x[i] binary_op y[i];                                            \
+    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                           \
-        return z;                                                                  \
+    friend constexpr auto operator binary_op(const array& x, const U& y)                    \
-    }                                                                              \
+    {                                                                                       \
-    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                  \
+        array<decltype(T {} binary_op U{}), N> z{};                                         \
-    friend constexpr auto operator binary_op(const array& x, const U& y)           \
+        array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; });              \
-    {                                                                              \
+        return z;                                                                           \
-        array<decltype(T {} binary_op U{}), N> z{};                                \
+    }                                                                                       \
-        for(index_int i = 0; i < N; i++)                                           \
+    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                           \
-            z[i] = x[i] binary_op y;                                               \
+    friend constexpr auto operator binary_op(const U& x, const array& y)                    \
-        return z;                                                                  \
+    {                                                                                       \
-    }                                                                              \
+        array<decltype(T {} binary_op U{}), N> z{};                                         \
-    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})>                  \
+        array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; });              \
-    friend constexpr auto operator binary_op(const U& x, const array& y)           \
+        return z;                                                                           \
-    {                                                                              \
-        array<decltype(T {} binary_op U{}), N> z{};                                \
-        for(index_int i = 0; i < N; i++)                                           \
-            z[i] = x binary_op y[i];                                               \
-        return z;                                                                  \
    }
+template <class T>
+constexpr auto is_vectorizable() // compile-time predicate consulted by array_for_each
+{
+    return not is_same<T, bool>{} and (is_fundamental<T>{} or is_same<T, half>{}); // bool is excluded; fundamental types and half qualify
+}
+// Reinterprets an array-like object as a reference to vec<value_type, size>.
+// The element type comes from T::value_type and the element count from the
+// type returned by x.size(). The const qualification of T is carried over to
+// the returned reference.
+template <class T>
+__device__ auto& array2vec(T& x)
+{
+    using element_type   = typename T::value_type;
+    constexpr auto count = decltype(x.size()){};
+    using vector_type    = vec<element_type, count>;
+    if constexpr(not is_const<T>{})
+        return reinterpret_cast<vector_type&>(x);
+    else
+        return reinterpret_cast<const vector_type&>(x);
+}
+// Builds a callable that applies a functor f across x and xs in lock-step.
+//
+// The returned lambda captures x and xs by reference, so it must be invoked
+// while they are still alive. When at least one of the value types is
+// vectorizable and the (compile-time) size is an even number in [2, 8], f is
+// called once with every argument reinterpreted through array2vec; during
+// constant evaluation, and in every other case, f is called once per index
+// with the individual elements.
+template <class T, class... Ts>
+constexpr auto array_for_each(T& x, Ts&... xs)
+{
+    // Each comparison is parenthesized: the operand of a fold expression must
+    // be a cast-expression, so an unparenthesized `a == b and ...` is rejected
+    // by the compiler whenever the assert macro expands its argument.
+    MIGRAPHX_ASSERT(((x.size() == xs.size()) and ...));
+    return [&](auto f) {
+        // size() is expected to return a compile-time constant type, so the
+        // value can be recovered from the type alone.
+        constexpr auto size = decltype(x.size()){};
+        if constexpr((is_vectorizable<typename T::value_type>() or
+                      (is_vectorizable<typename Ts::value_type>() or ...)) and
+                     size <= 8 and size > 1 and (size % 2 == 0))
+        {
+            // The reinterpret_cast inside array2vec is not allowed in a
+            // constant expression, so fall back to the element-wise loop there.
+            if(__builtin_is_constant_evaluated())
+            {
+                for(index_int i = 0; i < size; i++)
+                    f(x[i], xs[i]...);
+            }
+            else
+            {
+                f(array2vec(x), array2vec(xs)...);
+            }
+        }
+        else
+        {
+            for(index_int i = 0; i < size; i++)
+                f(x[i], xs[i]...);
+        }
+    };
+}
 template <class T, index_int N>
 struct array
 {
+    using value_type = T;
    T d[N];
    constexpr T& operator[](index_int i)
    {
@@ -108,18 +150,13 @@ struct array
    constexpr T dot(const array& x) const
    {
-        T result = 0;
+        auto r = x * (*this); // element-wise product of the two arrays
-        for(index_int i = 0; i < N; i++)
+        return r.reduce([](auto a, auto b) { return a + b; }, 0); // sum of the products, folded from 0
-            result += x[i] * d[i];
-        return result;
    }
    constexpr T product() const
    {
-        T result = 1;
+        return reduce([](auto x, auto y) { return x * y; }, 1); // multiply every element together, folded from 1
-        for(index_int i = 0; i < N; i++)
-            result *= d[i];
-        return result;
    }
    constexpr T single(index_int width = 100) const
@@ -134,6 +171,24 @@ struct array
        return result;
    }
+    // Returns a new array whose i-th element is f(d[i]); the element type of
+    // the result is whatever f returns for an element of this array.
+    template <class F>
+    constexpr auto apply(F f) const
+    {
+        // Value-initialized so that no element is ever left indeterminate; a
+        // local declared without an initializer can also stop a constexpr
+        // function from being usable in constant expressions.
+        array<decltype(f(d[0])), N> result{};
+        for(index_int i = 0; i < N; i++)
+            result[i] = f(d[i]);
+        return result;
+    }
+    // Left fold over the elements: starts from init and repeatedly replaces
+    // the accumulator with f(accumulator, d[i]) for i = 0 .. N-1. The
+    // accumulator has type T, so every intermediate result is converted to T.
+    template <class F>
+    constexpr auto reduce(F f, T init) const
+    {
+        T acc = init;
+        for(index_int pos = 0; pos < N; ++pos)
+        {
+            acc = f(acc, d[pos]);
+        }
+        return acc;
+    }
    MIGRAPHX_DEVICE_ARRAY_OP(+=, +)
    MIGRAPHX_DEVICE_ARRAY_OP(-=, -)
    MIGRAPHX_DEVICE_ARRAY_OP(*=, *)
@@ -201,6 +256,11 @@ struct array
    }
 };
+// Builds an array from the given values. The element type is the type of the
+// first argument; every further argument is converted to it with static_cast.
+template <class T, class... Ts>
+constexpr array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
+{
+    return array<T, sizeof...(Ts) + 1>{x, static_cast<T>(xs)...};
+}
 template <class T, T... Xs>
 struct integral_const_array : array<T, sizeof...(Xs)>
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -63,6 +63,7 @@ struct index
    template <class F, class N, class Stride>
    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
    {
+        static_assert(not is_integral<N>{}, "");
        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
                     max_stride_iterations(n, stride) == 1)
        {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -29,6 +29,12 @@
 namespace migraphx {
+// vec_reduce overload for arrays: calls vec_reduce(element, op) on every
+// element of a and returns the results as a new array of the same length.
+template <class T, index_int N, class Op>
+constexpr auto vec_reduce(const array<T, N>& a, Op op)
+{
+    auto reduce_element = [&](auto elem) { return vec_reduce(elem, op); };
+    return a.apply(reduce_element);
+}
 template <index_int Axis,
          class F,
          class BinOp,
@@ -43,23 +49,20 @@ __device__ void generic_binary_layernorm(
    reduce::block::run<reduce_output>([&](auto, auto r) {
        using value_type         = typename Input1::type;
        constexpr auto relements = r.template elements<Input1>();
-        auto mean                = [&](auto f) {
+        auto means =
-            return r.reduce(op::sum{}, 0, [&](auto x1, auto x2) {
+            r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x1, auto x2) {
-                return f(x1, x2) / value_type{relements};
+                auto x = op(x1, x2);
+                return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
            })(input1, input2);
-        };
-        // mean(x)
+        auto mean_x  = means[0];
-        auto mean_x = mean(op);
+        auto mean_x2 = means[1];
-        // mean(m ^ 2)
-        auto mean_m2 = mean([&](auto x1, auto x2) {
-            auto m = op(x1, x2) - mean_x;
-            return m * m;
-        });
        r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
-            auto m = op(x1, x2) - mean_x;
+            auto x = op(x1, x2);
+            auto m = x - mean_x;
+            // mean_x is E[x] and mean_x2 is E[x^2] (both come out of the single
+            // reduction above), so the variance is E[x^2] - E[x]^2. mean_x has
+            // to be squared before it is subtracted.
+            auto variance = mean_x2 - mean_x * mean_x;
-            // m * rsqrt(mean(m ^ 2) + 1e-12)
+            // m * rsqrt(variance + 1e-12)
-            y = compute(m * rsqrt(mean_m2 + value_type{1e-12}), xs...);
+            y = compute(m * rsqrt(variance + value_type{1e-12}), xs...);
        })(output, input1, input2, inputs...);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -94,8 +94,8 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max)
 MIGRAPHX_DPP_REDUCE(op::min, v_min)
 MIGRAPHX_DPP_REDUCE(op::product, v_mul)
-template <class Op, class T, class F>
+template <class Op, class T, class Index, class F>
-__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
+__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
 #if __AMDGCN_WAVEFRONT_SIZE == 32
    constexpr index_int lanes_per_thread = 16;
@@ -123,8 +123,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
    return y;
 }
 #else
-template <class Op, class T, class F>
+template <class Op, class T, class Index, class F>
-__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
+__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
    using type = decltype(f(0));
@@ -200,13 +200,10 @@ struct block
        template <class Op, class T, class Read>
        __device__ auto reduce(Op op, T init, Read read) const
        {
-            return sliced(slice, [=](auto x, auto... xs) {
+            return sliced(slicer, [=](auto x, auto... xs) {
-                return vec_reduce(block_reduce(idx,
+                return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) { // element count is passed with its own type (block_reduce is templated on Index)
-                                               op,
+                    return vec_reduce(read(x[j], xs[j]...), op); // vec_reduce now runs on each value read, inside the callback given to block_reduce
-                                               init,
+                });
-                                               x.get_shape().elements(),
-                                               [&](auto j) { return read(x[j], xs[j]...); }),
-                                  op);
            });
        }