Unverified Commit 6e2df9de authored by kahmed10's avatar kahmed10 Committed by GitHub
Browse files

Dpp opts for wavefront 32 (#951)

Checks wavefront size, then changes implementation and number of threads for DPP reduce
parent 95431eb7
...@@ -4,7 +4,7 @@ CheckOptions: ...@@ -4,7 +4,7 @@ CheckOptions:
- key: bugprone-unused-return-value.CheckedFunctions - key: bugprone-unused-return-value.CheckedFunctions
value: '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product' value: '::std::async;::std::launder;::std::remove;::std::remove_if;::std::unique;::std::unique_ptr::release;::std::basic_string::empty;::std::vector::empty;::std::find;::std::find_if;::std::find_if_not;::std::all_of;::std::any_of;::std::none_of;::std::count;::std::count_if;::std::mismatch;::std::find_end;::std::find_first_of;::std::adjacent_find;::std::search;::std::search_n;::std::nth_element;::std::lower_bound;::std::upper_bound;::std::binary_search;::std::equal_range;::std::max;::std::max_element;::std::min;::std::min_element;::std::minmax;::std::minmax_element;::std::equal;::std::lexicographical_compare;::std::accumulate;::std::inner_product'
- key: cppcoreguidelines-macro-usage.AllowedRegexp - key: cppcoreguidelines-macro-usage.AllowedRegexp
value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED' value: 'DEBUG|FALLTHROUGH|STRINGIZE|_HAS_|_THROW|_REQUIRES|_DECLARE_|_VISIT_|_REGISTER_|_GENERATE_|_DETAIL_|_TIDY_|_MANAGE_PTR|_MATCHER|DEVICE_SHARED|_WORKAROUND_'
- key: modernize-loop-convert.MinConfidence - key: modernize-loop-convert.MinConfidence
value: risky value: risky
- key: modernize-loop-convert.NamingStyle - key: modernize-loop-convert.NamingStyle
......
...@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_NO_DPP
#endif
#ifdef MIGRAPHX_NO_DPP #ifdef MIGRAPHX_NO_DPP
template <index_int N, template <index_int N,
class Op, class Op,
...@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op) ...@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in = op(in, out); in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in); out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out); in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in); out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out); in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in); out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out); in = op(in, out);
#endif
} }
__device__ inline void dpp_reduce(float& x, sum) __device__ inline void dpp_reduce(float& x, sum)
...@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum) ...@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1\n" "s_nop 1\n"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n" "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
"s_nop 1\n" "s_nop 1\n"
#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n" "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
"s_nop 1\n" "s_nop 1\n"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n" "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
#endif
"s_nop 1\n" "s_nop 1\n"
: "=v"(x) : "=v"(x)
: "0"(x)); : "0"(x));
...@@ -135,21 +135,27 @@ template <index_int N, ...@@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})> MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f) __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{ {
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / 64]; #if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int nthreads = 16;
#else
constexpr index_int nthreads = 64;
#endif
using type = decltype(f(deduce_for_stride(fs)));
MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
type x = init; type x = init;
fs([&](auto i) { x = op(x, f(i)); }); fs([&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op); dpp_reduce(x, op);
const auto ldsidx = idx.local / 64; const auto ldsidx = idx.local / nthreads;
if((idx.local % 64) == 63) if((idx.local % nthreads) == nthreads - 1)
{ {
buffer[ldsidx] = x; buffer[ldsidx] = x;
} }
__syncthreads(); __syncthreads();
type y = init; type y = init;
for(index_int i = 0; i < idx.nlocal() / 64; i++) for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
{ {
y = op(y, buffer[i]); y = op(y, buffer[i]);
} }
......
...@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS { ...@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu { namespace gpu {
namespace device { namespace device {
#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
#else
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
#endif
#endif
template <class T> template <class T>
struct vector_type struct vector_type
{ {
...@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i, ...@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const bool in_range = idx.local < relements_v; const bool in_range = idx.local < relements_v;
auto mean = [&](auto z) { auto mean = [&](auto z) {
return auto_block_reduce<MaxBlockSize>( auto m = auto_block_reduce<MaxBlockSize>(
idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) / idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
value_type(relements); value_type(relements);
#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
__builtin_amdgcn_s_barrier();
#endif
return m;
}; };
// m = x - mean(x) // m = x - mean(x)
......
...@@ -81,3 +81,20 @@ struct test_layernorm_triadd : verify_program<test_layernorm_triadd> ...@@ -81,3 +81,20 @@ struct test_layernorm_triadd : verify_program<test_layernorm_triadd>
return p; return p;
} }
}; };
struct test_layernorm_triadd_large : verify_program<test_layernorm_triadd_large>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
std::vector<size_t> dims = {1, 384, 1024};
auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dims});
auto y = mm->add_parameter("y", migraphx::shape{migraphx::shape::float_type, dims});
auto z = mm->add_parameter("z", migraphx::shape{migraphx::shape::float_type, dims});
auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
auto add2 = mm->add_instruction(migraphx::make_op("add"), add1, z);
add_layernorm(*mm, add2, dims);
return p;
}
};
...@@ -27,3 +27,16 @@ template struct test_reduce_op_large<migraphx::op::reduce_mean, 1, migraphx::sha ...@@ -27,3 +27,16 @@ template struct test_reduce_op_large<migraphx::op::reduce_mean, 1, migraphx::sha
template struct test_reduce_op_large<migraphx::op::reduce_min, 1, migraphx::shape::float_type>; template struct test_reduce_op_large<migraphx::op::reduce_min, 1, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_prod, 2, migraphx::shape::float_type>; template struct test_reduce_op_large<migraphx::op::reduce_prod, 2, migraphx::shape::float_type>;
template struct test_reduce_op_large<migraphx::op::reduce_sum, 1, migraphx::shape::float_type>; template struct test_reduce_op_large<migraphx::op::reduce_sum, 1, migraphx::shape::float_type>;
struct test_reduce_mean : verify_program<test_reduce_mean>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape s{migraphx::shape::float_type, {1, 384, 1024}};
auto x = mm->add_parameter("x", s);
mm->add_instruction(migraphx::op::reduce_mean{{1}}, x);
return p;
};
};
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment