Dpp opts for wavefront 32 (#951)

Checks wavefront size, then changes implementation and number of threads for DPP reduce

Dpp opts for wavefront 32 (#951)
Checks wavefront size, then changes implementation and number of threads for DPP reduce
6e2df9de · kahmed10 · GitHub · 95431eb7 · 6e2df9de · 6e2df9de
Unverified Commit 6e2df9de authored Sep 27, 2021 by kahmed10 Committed by GitHub Sep 27, 2021
5 changed files
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -4,7 +4,7 @@ CheckOptions:
  - key:             bugprone-unused-return-value.CheckedFunctions
    value:           '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
  - key:             cppcoreguidelines-macro-usage.AllowedRegexp
-    value:           'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED'
+    value:           'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
  - key:             modernize-loop-convert.MinConfidence
    value:           risky   
  - key:             modernize-loop-convert.NamingStyle

--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-#if __AMDGCN_WAVEFRONT_SIZE == 32
-#define MIGRAPHX_NO_DPP
-#endif
 #ifdef MIGRAPHX_NO_DPP
 template <index_int N,
          class Op,
@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in  = op(in, out);
+#if __AMDGCN_WAVEFRONT_SIZE == 64
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in  = op(in, out);
+#endif
 }
 __device__ inline void dpp_reduce(float& x, sum)
@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
                     "s_nop 1\n"
+#if __AMDGCN_WAVEFRONT_SIZE == 64
                     "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
                     "s_nop 1\n"
                     "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+#endif
                     "s_nop 1\n"
                     : "=v"(x)
                     : "0"(x));
@@ -135,21 +135,27 @@ template <index_int N,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
 __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
 {
-    using type = decltype(f(deduce_for_stride(fs)));
-    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+    constexpr index_int nthreads = 16;
+#else
+    constexpr index_int nthreads = 64;
+#endif
+    using type                   = decltype(f(deduce_for_stride(fs)));
+    MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
    type x = init;
    fs([&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);
-    const auto ldsidx = idx.local / 64;
+    const auto ldsidx = idx.local / nthreads;
-    if((idx.local % 64) == 63)
+    if((idx.local % nthreads) == nthreads - 1)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();
    type y = init;
-    for(index_int i = 0; i < idx.nlocal() / 64; i++)
+    for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
    {
        y = op(y, buffer[i]);
    }

--- a/src/targets/gpu/device/layernorm.cpp
+++ b/src/targets/gpu/device/layernorm.cpp
@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
+#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
+#else
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
+#endif
+#endif
 template <class T>
 struct vector_type
 {
@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
    const bool in_range    = idx.local < relements_v;
    auto mean = [&](auto z) {
-        return auto_block_reduce<MaxBlockSize>(
+        auto m = auto_block_reduce<MaxBlockSize>(
-                   idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
+                     idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
-               value_type(relements);
+                 value_type(relements);
+#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+        __builtin_amdgcn_s_barrier();
+#endif
+        return m;
    };
    // m = x - mean(x)

--- a/test/verify/test_layernorm.cpp
+++ b/test/verify/test_layernorm.cpp
@@ -81,3 +81,20 @@ struct test_layernorm_triadd : verify_program<test_layernorm_triadd>
        return p;
    }
 };
+struct test_layernorm_triadd_large : verify_program<test_layernorm_triadd_large>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm                 = p.get_main_module();
+        std::vector<size_t> dims = {1, 384, 1024};
+        auto x    = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
+        auto y    = mm->add_parameter("y", migraphx::shape{migraphx::shape::float_type, dims});
+        auto z    = mm->add_parameter("z", migraphx::shape{migraphx::shape::float_type, dims});
+        auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
+        auto add2 = mm->add_instruction(migraphx::make_op("add"), add1, z);
+        add_layernorm(*mm, add2, dims);
+        return p;
+    }
+};
--- a/test/verify/test_reduce_op_large.cpp
+++ b/test/verify/test_reduce_op_large.cpp
@@ -27,3 +27,16 @@ template struct test_reduce_op_large<migraphx::op::reduce_mean, 1, migraphx::sha
 template struct test_reduce_op_large<migraphx::op::reduce_min, 1, migraphx::shape::float_type>;
 template struct test_reduce_op_large<migraphx::op::reduce_prod, 2, migraphx::shape::float_type>;
 template struct test_reduce_op_large<migraphx::op::reduce_sum, 1, migraphx::shape::float_type>;
+struct test_reduce_mean : verify_program<test_reduce_mean>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        migraphx::shape s{migraphx::shape::float_type, {1, 384, 1024}};
+        auto x = mm->add_parameter("x", s);
+        mm->add_instruction(migraphx::op::reduce_mean{{1}}, x);
+        return p;
+    };
+};