Commit dca000fe authored by turneram's avatar turneram
Browse files

Merge remote-tracking branch 'origin/jit-layernorm-merge' into bert-perf

parents 8309364c e5a25712
...@@ -33,49 +33,91 @@ ...@@ -33,49 +33,91 @@
namespace migraphx { namespace migraphx {
// NOLINTNEXTLINE // NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \ #define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \ template <class U> \
constexpr array& operator op(const array<U, N>& x) \ constexpr array& operator op(const array<U, N>& x) \
{ \ { \
for(index_int i = 0; i < N; i++) \ array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \
d[i] op x[i]; \ return *this; \
return *this; \ } \
} \ template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \ constexpr array& operator op(const U& x) \
constexpr array& operator op(const U& x) \ { \
{ \ array_for_each (*this)([&](auto& sy) { sy op x; }); \
for(index_int i = 0; i < N; i++) \ return *this; \
d[i] op x; \ } \
return *this; \ template <class U> \
} \ friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
template <class U> \ { \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \ array<decltype(T {} binary_op U{}), N> z{}; \
{ \ array_for_each(z, x, y)([&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
array<decltype(T {} binary_op U{}), N> z{}; \ return z; \
for(index_int i = 0; i < N; i++) \ } \
z[i] = x[i] binary_op y[i]; \ template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
return z; \ friend constexpr auto operator binary_op(const array& x, const U& y) \
} \ { \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \ array<decltype(T {} binary_op U{}), N> z{}; \
friend constexpr auto operator binary_op(const array& x, const U& y) \ array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \
{ \ return z; \
array<decltype(T {} binary_op U{}), N> z{}; \ } \
for(index_int i = 0; i < N; i++) \ template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
z[i] = x[i] binary_op y; \ friend constexpr auto operator binary_op(const U& x, const array& y) \
return z; \ { \
} \ array<decltype(T {} binary_op U{}), N> z{}; \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \ array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \
friend constexpr auto operator binary_op(const U& x, const array& y) \ return z; \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x binary_op y[i]; \
return z; \
} }
template <class T>
constexpr auto is_vectorizable()
{
return not is_same<T, bool>{} and (is_fundamental<T>{} or is_same<T, half>{});
}
// Views an array-like object `x` as a reference to
// vec<T::value_type, size>, where size is taken from the type returned by
// x.size(). The constness of T is carried over to the returned reference.
//
// NOTE(review): the cast assumes T's storage is layout- and
// alignment-compatible with the vec type; nothing here checks that -- confirm
// at the call sites.
template <class T>
__device__ auto& array2vec(T& x)
{
    using element_type = typename T::value_type;
    // size() is only used for its type; a default-constructed instance of
    // that type supplies the element count as a compile-time constant.
    constexpr auto count = decltype(x.size()){};
    using vector_type    = vec<element_type, count>;
    if constexpr(not is_const<T>{})
        return reinterpret_cast<vector_type&>(x);
    else
        return reinterpret_cast<const vector_type&>(x);
}
// Builds an element-wise visitor over one or more array-like objects of equal
// size.
//
// Returns a callable that accepts a functor `f`:
//  - Vector path: when at least one participating value_type is vectorizable
//    and the size is an even number in [2, 8], `f` is invoked once with every
//    array reinterpreted through array2vec.
//  - Scalar path: otherwise, and always during constant evaluation, `f` is
//    invoked once per index as f(x[i], xs[i]...).
//
// The returned callable captures `x` and `xs` by reference, so it must be
// invoked while those objects are still alive.
//
// NOTE(review): the vectorizable test is an `or` across all element types, so
// a mix of vectorizable and non-vectorizable arrays still takes the vector
// path -- confirm this is intended.
template <class T, class... Ts>
constexpr auto array_for_each(T& x, Ts&... xs)
{
    // Each comparison is parenthesized because the operand of a fold
    // expression must be a cast-expression; a bare `a == b and ...` is
    // ill-formed whenever the assertion's argument is actually compiled.
    MIGRAPHX_ASSERT(((x.size() == xs.size()) and ...));
    return [&](auto f) {
        constexpr auto size = decltype(x.size()){};
        if constexpr((is_vectorizable<typename T::value_type>() or
                      (is_vectorizable<typename Ts::value_type>() or ...)) and
                     size <= 8 and size > 1 and (size % 2 == 0))
        {
            if(__builtin_is_constant_evaluated())
            {
                // array2vec relies on reinterpret_cast, which is not allowed
                // in a constant expression, so fall back to the scalar loop.
                for(index_int i = 0; i < size; i++)
                    f(x[i], xs[i]...);
            }
            else
            {
                f(array2vec(x), array2vec(xs)...);
            }
        }
        else
        {
            for(index_int i = 0; i < size; i++)
                f(x[i], xs[i]...);
        }
    };
}
template <class T, index_int N> template <class T, index_int N>
struct array struct array
{ {
using value_type = T;
T d[N]; T d[N];
constexpr T& operator[](index_int i) constexpr T& operator[](index_int i)
{ {
...@@ -108,18 +150,13 @@ struct array ...@@ -108,18 +150,13 @@ struct array
constexpr T dot(const array& x) const constexpr T dot(const array& x) const
{ {
T result = 0; auto r = x * (*this);
for(index_int i = 0; i < N; i++) return r.reduce([](auto a, auto b) { return a + b; }, 0);
result += x[i] * d[i];
return result;
} }
constexpr T product() const constexpr T product() const
{ {
T result = 1; return reduce([](auto x, auto y) { return x * y; }, 1);
for(index_int i = 0; i < N; i++)
result *= d[i];
return result;
} }
constexpr T single(index_int width = 100) const constexpr T single(index_int width = 100) const
...@@ -134,6 +171,24 @@ struct array ...@@ -134,6 +171,24 @@ struct array
return result; return result;
} }
// Returns a new array of the same length whose i-th element is f(d[i]).
// The element type of the result is whatever `f` returns for an element.
template <class F>
constexpr auto apply(F f) const
{
    // Value-initialize the result: a variable without an initializer keeps a
    // constexpr function from being constant-evaluated before C++20, and the
    // arithmetic operators in this file build their temporaries with `{}` too.
    array<decltype(f(d[0])), N> result{};
    for(index_int i = 0; i < N; i++)
        result[i] = f(d[i]);
    return result;
}
template <class F>
constexpr auto reduce(F f, T init) const
{
T result = init;
for(index_int i = 0; i < N; i++)
result = f(result, d[i]);
return result;
}
MIGRAPHX_DEVICE_ARRAY_OP(+=, +) MIGRAPHX_DEVICE_ARRAY_OP(+=, +)
MIGRAPHX_DEVICE_ARRAY_OP(-=, -) MIGRAPHX_DEVICE_ARRAY_OP(-=, -)
MIGRAPHX_DEVICE_ARRAY_OP(*=, *) MIGRAPHX_DEVICE_ARRAY_OP(*=, *)
...@@ -201,6 +256,11 @@ struct array ...@@ -201,6 +256,11 @@ struct array
} }
}; };
// Builds an array<T, K> from K arguments. The element type is taken from the
// first argument; every remaining argument is static_cast to that type.
template <class T, class... Ts>
constexpr array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
{
    using result_type = array<T, sizeof...(Ts) + 1>;
    return result_type{x, static_cast<T>(xs)...};
}
template <class T, T... Xs> template <class T, T... Xs>
struct integral_const_array : array<T, sizeof...(Xs)> struct integral_const_array : array<T, sizeof...(Xs)>
{ {
......
...@@ -63,6 +63,7 @@ struct index ...@@ -63,6 +63,7 @@ struct index
template <class F, class N, class Stride> template <class F, class N, class Stride>
static constexpr void for_stride(index_int start, N n, Stride stride, F f) static constexpr void for_stride(index_int start, N n, Stride stride, F f)
{ {
static_assert(not is_integral<N>{}, "");
if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
max_stride_iterations(n, stride) == 1) max_stride_iterations(n, stride) == 1)
{ {
......
...@@ -29,6 +29,12 @@ ...@@ -29,6 +29,12 @@
namespace migraphx { namespace migraphx {
// Array overload of vec_reduce: applies vec_reduce(element, op) to each element
// of `a` and returns the results as a new array produced by array::apply.
template <class T, index_int N, class Op>
constexpr auto vec_reduce(const array<T, N>& a, Op op)
{
    auto reduce_element = [&](auto element) { return vec_reduce(element, op); };
    return a.apply(reduce_element);
}
template <index_int Axis, template <index_int Axis,
class F, class F,
class BinOp, class BinOp,
...@@ -43,23 +49,20 @@ __device__ void generic_binary_layernorm( ...@@ -43,23 +49,20 @@ __device__ void generic_binary_layernorm(
reduce::block::run<reduce_output>([&](auto, auto r) { reduce::block::run<reduce_output>([&](auto, auto r) {
using value_type = typename Input1::type; using value_type = typename Input1::type;
constexpr auto relements = r.template elements<Input1>(); constexpr auto relements = r.template elements<Input1>();
auto mean = [&](auto f) { auto means =
return r.reduce(op::sum{}, 0, [&](auto x1, auto x2) { r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x1, auto x2) {
return f(x1, x2) / value_type{relements}; auto x = op(x1, x2);
return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
})(input1, input2); })(input1, input2);
};
// mean(x) auto mean_x = means[0];
auto mean_x = mean(op); auto mean_x2 = means[1];
// mean(m ^ 2)
auto mean_m2 = mean([&](auto x1, auto x2) {
auto m = op(x1, x2) - mean_x;
return m * m;
});
r.inner([&](auto& y, auto x1, auto x2, auto... xs) { r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
auto m = op(x1, x2) - mean_x; auto x = op(x1, x2);
auto m = x - mean_x;
// m * rsqrt(mean(m ^ 2) + 1e-12) // m * rsqrt(mean(m ^ 2) + 1e-12)
y = compute(m * rsqrt(mean_m2 + value_type{1e-12}), xs...); y = compute(m * rsqrt(mean_x2 - mean_x + value_type{1e-12}), xs...);
})(output, input1, input2, inputs...); })(output, input1, input2, inputs...);
}); });
} }
......
...@@ -94,8 +94,8 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max) ...@@ -94,8 +94,8 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min) MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul) MIGRAPHX_DPP_REDUCE(op::product, v_mul)
template <class Op, class T, class F> template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f) __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
{ {
#if __AMDGCN_WAVEFRONT_SIZE == 32 #if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int lanes_per_thread = 16; constexpr index_int lanes_per_thread = 16;
...@@ -123,8 +123,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f) ...@@ -123,8 +123,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
return y; return y;
} }
#else #else
template <class Op, class T, class F> template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f) __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
{ {
using type = decltype(f(0)); using type = decltype(f(0));
...@@ -200,13 +200,10 @@ struct block ...@@ -200,13 +200,10 @@ struct block
template <class Op, class T, class Read> template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const __device__ auto reduce(Op op, T init, Read read) const
{ {
return sliced(slice, [=](auto x, auto... xs) { return sliced(slicer, [=](auto x, auto... xs) {
return vec_reduce(block_reduce(idx, return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
op, return vec_reduce(read(x[j], xs[j]...), op);
init, });
x.get_shape().elements(),
[&](auto j) { return read(x[j], xs[j]...); }),
op);
}); });
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment