Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
dca000fe
Commit
dca000fe
authored
Aug 23, 2022
by
turneram
Browse files
Merge remote-tracking branch 'origin/jit-layernorm-merge' into bert-perf
parents
8309364c
e5a25712
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
131 additions
and
70 deletions
+131
-70
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+106
-46
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+1
-0
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
...argets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+16
-13
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+8
-11
No files found.
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
View file @
dca000fe
...
...
@@ -33,49 +33,91 @@
namespace
migraphx
{
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y[i]; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x binary_op y[i]; \
return z; \
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
array_for_each (*this)([&](auto& sy) { sy op x; }); \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, x, y)([&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \
return z; \
}
template
<
class
T
>
constexpr
auto
is_vectorizable
()
{
return
not
is_same
<
T
,
bool
>
{}
and
(
is_fundamental
<
T
>
{}
or
is_same
<
T
,
half
>
{});
}
template
<
class
T
>
__device__
auto
&
array2vec
(
T
&
x
)
{
using
value_type
=
typename
T
::
value_type
;
constexpr
auto
size
=
decltype
(
x
.
size
()){};
using
type
=
vec
<
value_type
,
size
>
;
if
constexpr
(
is_const
<
T
>
{})
return
reinterpret_cast
<
const
type
&>
(
x
);
else
return
reinterpret_cast
<
type
&>
(
x
);
}
template
<
class
T
,
class
...
Ts
>
constexpr
auto
array_for_each
(
T
&
x
,
Ts
&
...
xs
)
{
MIGRAPHX_ASSERT
((
x
.
size
()
==
xs
.
size
()
and
...));
return
[
&
](
auto
f
)
{
constexpr
auto
size
=
decltype
(
x
.
size
()){};
if
constexpr
((
is_vectorizable
<
typename
T
::
value_type
>
()
or
(
is_vectorizable
<
typename
Ts
::
value_type
>
()
or
...))
and
size
<=
8
and
size
>
1
and
(
size
%
2
==
0
))
{
if
(
__builtin_is_constant_evaluated
())
{
for
(
index_int
i
=
0
;
i
<
size
;
i
++
)
f
(
x
[
i
],
xs
[
i
]...);
}
else
{
f
(
array2vec
(
x
),
array2vec
(
xs
)...);
}
}
else
{
for
(
index_int
i
=
0
;
i
<
size
;
i
++
)
f
(
x
[
i
],
xs
[
i
]...);
}
};
}
template
<
class
T
,
index_int
N
>
struct
array
{
using
value_type
=
T
;
T
d
[
N
];
constexpr
T
&
operator
[](
index_int
i
)
{
...
...
@@ -108,18 +150,13 @@ struct array
constexpr
T
dot
(
const
array
&
x
)
const
{
T
result
=
0
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
+=
x
[
i
]
*
d
[
i
];
return
result
;
auto
r
=
x
*
(
*
this
);
return
r
.
reduce
([](
auto
a
,
auto
b
)
{
return
a
+
b
;
},
0
);
}
constexpr
T
product
()
const
{
T
result
=
1
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
*=
d
[
i
];
return
result
;
return
reduce
([](
auto
x
,
auto
y
)
{
return
x
*
y
;
},
1
);
}
constexpr
T
single
(
index_int
width
=
100
)
const
...
...
@@ -134,6 +171,24 @@ struct array
return
result
;
}
template
<
class
F
>
constexpr
auto
apply
(
F
f
)
const
{
array
<
decltype
(
f
(
d
[
0
])),
N
>
result
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
[
i
]
=
f
(
d
[
i
]);
return
result
;
}
template
<
class
F
>
constexpr
auto
reduce
(
F
f
,
T
init
)
const
{
T
result
=
init
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
=
f
(
result
,
d
[
i
]);
return
result
;
}
MIGRAPHX_DEVICE_ARRAY_OP
(
+=
,
+
)
MIGRAPHX_DEVICE_ARRAY_OP
(
-=
,
-
)
MIGRAPHX_DEVICE_ARRAY_OP
(
*=
,
*
)
...
...
@@ -201,6 +256,11 @@ struct array
}
};
template
<
class
T
,
class
...
Ts
>
constexpr
array
<
T
,
sizeof
...(
Ts
)
+
1
>
make_array
(
T
x
,
Ts
...
xs
)
{
return
{
x
,
static_cast
<
T
>
(
xs
)...};
}
template
<
class
T
,
T
...
Xs
>
struct
integral_const_array
:
array
<
T
,
sizeof
...(
Xs
)
>
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
View file @
dca000fe
...
...
@@ -63,6 +63,7 @@ struct index
template
<
class
F
,
class
N
,
class
Stride
>
static
constexpr
void
for_stride
(
index_int
start
,
N
n
,
Stride
stride
,
F
f
)
{
static_assert
(
not
is_integral
<
N
>
{},
""
);
if
constexpr
(
not
is_integral
<
N
>
{}
and
not
is_integral
<
Stride
>
{}
and
max_stride_iterations
(
n
,
stride
)
==
1
)
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
View file @
dca000fe
...
...
@@ -29,6 +29,12 @@
namespace
migraphx
{
template
<
class
T
,
index_int
N
,
class
Op
>
constexpr
auto
vec_reduce
(
const
array
<
T
,
N
>&
a
,
Op
op
)
{
return
a
.
apply
([
&
](
auto
x
)
{
return
vec_reduce
(
x
,
op
);
});
}
template
<
index_int
Axis
,
class
F
,
class
BinOp
,
...
...
@@ -43,23 +49,20 @@ __device__ void generic_binary_layernorm(
reduce
::
block
::
run
<
reduce_output
>
([
&
](
auto
,
auto
r
)
{
using
value_type
=
typename
Input1
::
type
;
constexpr
auto
relements
=
r
.
template
elements
<
Input1
>();
auto
mean
=
[
&
](
auto
f
)
{
return
r
.
reduce
(
op
::
sum
{},
0
,
[
&
](
auto
x1
,
auto
x2
)
{
return
f
(
x1
,
x2
)
/
value_type
{
relements
};
auto
means
=
r
.
reduce
(
op
::
sum
{},
make_array
<
vec_type
<
value_type
>>
(
0
,
0
),
[
&
](
auto
x1
,
auto
x2
)
{
auto
x
=
op
(
x1
,
x2
);
return
make_array
(
x
,
x
*
x
)
*
vec_type
<
value_type
>
{
1.0
/
relements
};
})(
input1
,
input2
);
};
// mean(x)
auto
mean_x
=
mean
(
op
);
// mean(m ^ 2)
auto
mean_m2
=
mean
([
&
](
auto
x1
,
auto
x2
)
{
auto
m
=
op
(
x1
,
x2
)
-
mean_x
;
return
m
*
m
;
});
auto
mean_x
=
means
[
0
];
auto
mean_x2
=
means
[
1
];
r
.
inner
([
&
](
auto
&
y
,
auto
x1
,
auto
x2
,
auto
...
xs
)
{
auto
m
=
op
(
x1
,
x2
)
-
mean_x
;
auto
x
=
op
(
x1
,
x2
);
auto
m
=
x
-
mean_x
;
// m * rsqrt(mean(m ^ 2) + 1e-12)
y
=
compute
(
m
*
rsqrt
(
mean_
m2
+
value_type
{
1e-12
}),
xs
...);
y
=
compute
(
m
*
rsqrt
(
mean_
x2
-
mean_x
+
value_type
{
1e-12
}),
xs
...);
})(
output
,
input1
,
input2
,
inputs
...);
});
}
...
...
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
View file @
dca000fe
...
...
@@ -94,8 +94,8 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE
(
op
::
min
,
v_min
)
MIGRAPHX_DPP_REDUCE
(
op
::
product
,
v_mul
)
template
<
class
Op
,
class
T
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
i
ndex
_int
n
,
F
f
)
template
<
class
Op
,
class
T
,
class
Index
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
I
ndex
n
,
F
f
)
{
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr
index_int
lanes_per_thread
=
16
;
...
...
@@ -123,8 +123,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
return
y
;
}
#else
template
<
class
Op
,
class
T
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
i
ndex
_int
n
,
F
f
)
template
<
class
Op
,
class
T
,
class
Index
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
I
ndex
n
,
F
f
)
{
using
type
=
decltype
(
f
(
0
));
...
...
@@ -200,13 +200,10 @@ struct block
template
<
class
Op
,
class
T
,
class
Read
>
__device__
auto
reduce
(
Op
op
,
T
init
,
Read
read
)
const
{
return
sliced
(
slice
,
[
=
](
auto
x
,
auto
...
xs
)
{
return
vec_reduce
(
block_reduce
(
idx
,
op
,
init
,
x
.
get_shape
().
elements
(),
[
&
](
auto
j
)
{
return
read
(
x
[
j
],
xs
[
j
]...);
}),
op
);
return
sliced
(
slicer
,
[
=
](
auto
x
,
auto
...
xs
)
{
return
block_reduce
(
idx
,
op
,
init
,
x
.
get_shape
().
elements
(),
[
&
](
auto
j
)
{
return
vec_reduce
(
read
(
x
[
j
],
xs
[
j
]...),
op
);
});
});
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment