Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
3a4d36cf
Commit
3a4d36cf
authored
Sep 30, 2022
by
charlie
Browse files
Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test
parents
6bec381f
e19f78ae
Changes
384
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
504 additions
and
249 deletions
+504
-249
src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
...argets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
+1
-1
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+111
-47
src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
+66
-0
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
...rgets/gpu/kernels/include/migraphx/kernels/functional.hpp
+3
-2
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+81
-5
src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
...pu/kernels/include/migraphx/kernels/integral_constant.hpp
+3
-3
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
...argets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+89
-0
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+7
-0
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+2
-2
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+30
-13
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
...gets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+5
-3
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+1
-1
src/targets/gpu/lowering.cpp
src/targets/gpu/lowering.cpp
+5
-80
src/targets/gpu/mlir.cpp
src/targets/gpu/mlir.cpp
+2
-2
src/targets/gpu/pack_int8_args.cpp
src/targets/gpu/pack_int8_args.cpp
+1
-1
src/targets/gpu/prefuse_ops.cpp
src/targets/gpu/prefuse_ops.cpp
+59
-37
src/targets/gpu/quant_convolution.cpp
src/targets/gpu/quant_convolution.cpp
+0
-1
src/targets/gpu/rocblas.cpp
src/targets/gpu/rocblas.cpp
+33
-0
src/targets/gpu/softmax.cpp
src/targets/gpu/softmax.cpp
+0
-49
src/targets/gpu/target.cpp
src/targets/gpu/target.cpp
+5
-2
No files found.
src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
View file @
3a4d36cf
...
...
@@ -163,7 +163,7 @@ constexpr Iterator1 search(Iterator1 first, Iterator1 last, Iterator2 s_first, I
{
return
last
;
}
if
(
!
(
*
it
==
*
s_it
))
if
(
not
(
*
it
==
*
s_it
))
{
break
;
}
...
...
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
View file @
3a4d36cf
...
...
@@ -33,49 +33,95 @@
namespace
migraphx
{
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y[i]; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x binary_op y[i]; \
return z; \
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
array_detail::array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
array_detail::array_for_each (*this)([&](auto& sy) { sy op x; }); \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, x, y)( \
[&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \
return z; \
}
namespace
array_detail
{
template
<
class
T
>
constexpr
auto
is_vectorizable
()
{
return
not
is_same
<
T
,
bool
>
{}
and
(
is_fundamental
<
T
>
{}
or
is_same
<
T
,
half
>
{});
}
template
<
class
T
>
__device__
auto
&
array2vec
(
T
&
x
)
{
using
value_type
=
typename
T
::
value_type
;
constexpr
auto
size
=
decltype
(
x
.
size
()){};
using
type
=
vec
<
value_type
,
size
>
;
if
constexpr
(
is_const
<
T
>
{})
return
reinterpret_cast
<
const
type
&>
(
x
);
else
return
reinterpret_cast
<
type
&>
(
x
);
}
template
<
class
T
,
class
...
Ts
>
constexpr
auto
array_for_each
(
T
&
x
,
Ts
&
...
xs
)
{
MIGRAPHX_ASSERT
(((
x
.
size
()
==
xs
.
size
())
and
...));
return
[
&
](
auto
f
)
{
constexpr
auto
size
=
decltype
(
x
.
size
()){};
if
constexpr
((
is_vectorizable
<
typename
T
::
value_type
>
()
or
(
is_vectorizable
<
typename
Ts
::
value_type
>
()
or
...))
and
size
<=
8
and
size
>
1
and
(
size
%
2
==
0
))
{
if
(
__builtin_is_constant_evaluated
())
{
for
(
index_int
i
=
0
;
i
<
size
;
i
++
)
f
(
x
[
i
],
xs
[
i
]...);
}
else
{
using
vec_type
=
std
::
remove_reference_t
<
decltype
(
array2vec
(
x
))
>
;
f
(
array2vec
(
x
),
__builtin_convertvector
(
array2vec
(
xs
),
vec_type
)...);
}
}
else
{
for
(
index_int
i
=
0
;
i
<
size
;
i
++
)
f
(
x
[
i
],
xs
[
i
]...);
}
};
}
}
// namespace array_detail
template
<
class
T
,
index_int
N
>
struct
array
{
using
value_type
=
T
;
T
d
[
N
];
constexpr
T
&
operator
[](
index_int
i
)
{
...
...
@@ -108,18 +154,13 @@ struct array
constexpr
T
dot
(
const
array
&
x
)
const
{
T
result
=
0
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
+=
x
[
i
]
*
d
[
i
];
return
result
;
auto
r
=
x
*
(
*
this
);
return
r
.
reduce
([](
auto
a
,
auto
b
)
{
return
a
+
b
;
},
0
);
}
constexpr
T
product
()
const
{
T
result
=
1
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
*=
d
[
i
];
return
result
;
return
reduce
([](
auto
x
,
auto
y
)
{
return
x
*
y
;
},
1
);
}
constexpr
T
single
(
index_int
width
=
100
)
const
...
...
@@ -134,6 +175,24 @@ struct array
return
result
;
}
template
<
class
F
>
constexpr
auto
apply
(
F
f
)
const
{
array
<
decltype
(
f
(
d
[
0
])),
N
>
result
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
[
i
]
=
f
(
d
[
i
]);
return
result
;
}
template
<
class
F
>
constexpr
auto
reduce
(
F
f
,
T
init
)
const
{
T
result
=
init
;
for
(
index_int
i
=
0
;
i
<
N
;
i
++
)
result
=
f
(
result
,
d
[
i
]);
return
result
;
}
MIGRAPHX_DEVICE_ARRAY_OP
(
+=
,
+
)
MIGRAPHX_DEVICE_ARRAY_OP
(
-=
,
-
)
MIGRAPHX_DEVICE_ARRAY_OP
(
*=
,
*
)
...
...
@@ -153,7 +212,7 @@ struct array
return
true
;
}
friend
constexpr
bool
operator
!=
(
const
array
&
x
,
const
array
&
y
)
{
return
!
(
x
==
y
);
}
friend
constexpr
bool
operator
!=
(
const
array
&
x
,
const
array
&
y
)
{
return
not
(
x
==
y
);
}
// This uses the product order rather than lexical order
friend
constexpr
bool
operator
<
(
const
array
&
x
,
const
array
&
y
)
{
...
...
@@ -201,6 +260,11 @@ struct array
}
};
template
<
class
T
,
class
...
Ts
>
constexpr
array
<
T
,
sizeof
...(
Ts
)
+
1
>
make_array
(
T
x
,
Ts
...
xs
)
{
return
{
x
,
static_cast
<
T
>
(
xs
)...};
}
template
<
class
T
,
T
...
Xs
>
struct
integral_const_array
:
array
<
T
,
sizeof
...(
Xs
)
>
{
...
...
src/targets/gpu/include/migraphx/
gpu/max
.hpp
→
src/targets/gpu/
kernels/
include/migraphx/
kernels/concat
.hpp
View file @
3a4d36cf
...
...
@@ -21,22 +21,46 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_MAX_HPP
#define MIGRAPHX_GUARD_RTGLIB_MAX_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/max.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#ifndef MIGRAPHX_GUARD_KERNELS_CONCAT_HPP
#define MIGRAPHX_GUARD_KERNELS_CONCAT_HPP
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
struct
hip_max
:
binary_device
<
hip_max
,
device
::
max
>
template
<
index_int
Axis
,
class
Output
,
class
Input
,
class
Start
>
constexpr
auto
concat_slice
(
Output
out
,
Input
,
Start
)
{
};
constexpr
auto
lens
=
get_shape_c
<
Input
>
{}.
lens
;
constexpr
auto
strides
=
get_shape_c
<
Output
>
{}.
strides
;
constexpr
auto
offset
=
return_c
([]
{
constexpr
auto
output_shape
=
get_shape_c
<
Output
>
{};
return
Start
{}
*
output_shape
.
strides
[
Axis
];
});
constexpr
auto
s
=
make_shape
(
lens
,
strides
);
return
make_tensor_view
(
&
out
[
offset
],
s
);
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
template
<
index_int
Axis
,
class
Input
>
constexpr
auto
concat_ends
(
Input
)
{
constexpr
auto
lens
=
get_shape_c
<
Input
>
{}.
lens
;
return
_c
<
lens
[
Axis
]
>
;
}
#endif
template
<
index_int
Axis
,
class
Output
,
class
...
Inputs
>
__device__
void
concat
(
Output
output
,
Inputs
...
inputs
)
{
auto
idx
=
make_index
();
fold
([
&
](
auto
start
,
auto
input
)
{
auto
y
=
concat_slice
<
Axis
>
(
output
,
input
,
start
);
idx
.
global_stride
(
input
.
get_shape
().
elements
(),
[
&
](
auto
i
)
{
y
[
i
]
=
input
[
i
];
});
return
start
+
concat_ends
<
Axis
>
(
input
);
})(
_c
<
0
>
,
inputs
...);
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_CONCAT_HPP
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
View file @
3a4d36cf
...
...
@@ -31,8 +31,9 @@
->decltype(__VA_ARGS__) { return __VA_ARGS__; }
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
#define MIGRAPHX_LIFT(...) \
[](auto&&... private_lisft_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lisft_xs)>(private_lisft_xs)...))
namespace
migraphx
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
View file @
3a4d36cf
...
...
@@ -28,9 +28,60 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/debug.hpp>
namespace
migraphx
{
#if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
#define MIGRAPHX_NGROUP ((MIGRAPHX_NGLOBAL + MIGRAPHX_NLOCAL - 1) / MIGRAPHX_NLOCAL)
#endif
inline
__device__
__attribute__
((
const
))
index_int
compute_global_size
()
{
#ifdef MIGRAPHX_NGLOBAL
return
MIGRAPHX_NGLOBAL
;
#else
// This actualy works even when global is not divisible by local size.
// This doesnt actually do a multiplicatiosn. Instead it calls a device
// function to get the global size, which is why it works.
return
blockDim
.
x
*
gridDim
.
x
;
// NOLINT
#endif
}
// We cant just use blockDim.x to get the local size since its broken on hip
// when global is not divisible by local size. In this case, we calulate the
// size for the last group.
inline
__device__
__attribute__
((
const
))
index_int
compute_local_size
()
{
#ifdef MIGRAPHX_NLOCAL
const
auto
nlocal
=
MIGRAPHX_NLOCAL
;
#else
const
auto
nlocal
=
blockDim
.
x
;
// NOLINT
#endif
#ifdef MIGRAPHX_NGROUP
const
auto
ngroup
=
MIGRAPHX_NGROUP
;
#else
const
auto
ngroup
=
gridDim
.
x
;
// NOLINT
#endif
const
auto
group_id
=
blockIdx
.
x
;
// NOLINT
const
auto
nglobal
=
compute_global_size
();
if
(
group_id
==
ngroup
-
1
)
{
return
1
+
(
nglobal
-
1
)
%
nlocal
;
}
else
{
return
nlocal
;
// NOLINT
}
}
#ifdef MIGRAPHX_NGROUP
// If global is divisible by local then local can be a const
#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
#define MIGRAPHX_HAS_CONST_LOCAL 1
#endif
#endif
struct
index
{
index_int
global
=
0
;
...
...
@@ -38,20 +89,44 @@ struct index
index_int
group
=
0
;
#ifdef MIGRAPHX_NGLOBAL
constexpr
index_constant
<
MIGRAPHX_NGLOBAL
>
nglobal
()
const
{
return
{};
}
constexpr
index_constant
<
MIGRAPHX_NGLOBAL
>
nglobal
()
const
{
static_assert
(
MIGRAPHX_NGLOBAL
>
0
,
"Global size must be greater than 0"
);
return
{};
}
#else
__device__
index_int
nglobal
()
const
{
return
blockDim
.
x
*
gridDim
.
x
;
// NOLINT
MIGRAPHX_ASSERT
(
compute_global_size
()
>
0
);
return
compute_global_size
();
// NOLINT
}
#endif
#ifdef MIGRAPHX_NLOCAL
constexpr
index_constant
<
MIGRAPHX_NLOCAL
>
nlocal
()
const
{
return
{};
}
#ifdef MIGRAPHX_HAS_CONST_LOCAL
constexpr
index_constant
<
MIGRAPHX_NLOCAL
>
nlocal
()
const
{
static_assert
(
MIGRAPHX_NLOCAL
>
0
,
"Local size must be greater than 0"
);
return
{};
}
#else
__device__
index_int
nlocal
()
const
{
return
blockDim
.
x
;
// NOLINT
#ifdef MIGRAPHX_NGROUP
static_assert
((
MIGRAPHX_NGLOBAL
%
MIGRAPHX_NLOCAL
!=
0
)
and
(
MIGRAPHX_NGROUP
>
1
),
"Local size should be const"
);
#endif
MIGRAPHX_ASSERT
(
compute_local_size
()
>
0
);
return
compute_local_size
();
// NOLINT
}
#endif
#ifdef MIGRAPHX_NLOCAL
constexpr
index_constant
<
MIGRAPHX_NLOCAL
>
max_nlocal
()
const
{
return
{};
}
#else
__device__
index_int
max_nlocal
()
const
{
MIGRAPHX_ASSERT
(
blockDim
.
x
>
0
);
return
blockDim
.
x
;
}
#endif
template
<
class
N
,
class
Stride
>
...
...
@@ -63,6 +138,7 @@ struct index
template
<
class
F
,
class
N
,
class
Stride
>
static
constexpr
void
for_stride
(
index_int
start
,
N
n
,
Stride
stride
,
F
f
)
{
MIGRAPHX_ASSERT
(
start
<
stride
);
if
constexpr
(
not
is_integral
<
N
>
{}
and
not
is_integral
<
Stride
>
{}
and
max_stride_iterations
(
n
,
stride
)
==
1
)
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp
View file @
3a4d36cf
...
...
@@ -73,10 +73,10 @@ MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(!=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
&
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
^
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
|
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
&&
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
||
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
and
)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP
(
or
)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP
(
!
)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP
(
not
)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP
(
~
)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP
(
+
)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP
(
-
)
...
...
src/targets/gpu/
device/add_sigmoid.c
pp
→
src/targets/gpu/
kernels/include/migraphx/kernels/layernorm.h
pp
View file @
3a4d36cf
...
...
@@ -21,35 +21,69 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/add_sigmoid.hpp>
#include <migraphx/gpu/device/nary.hpp>
#ifndef MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
#define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
#include <migraphx/kernels/reduce.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/print.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
void
add_sigmoid
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg1
,
const
argument
&
arg2
)
template
<
class
T
,
index_int
N
,
class
Op
>
constexpr
auto
vec_reduce
(
const
array
<
T
,
N
>&
a
,
Op
op
)
{
nary
(
stream
,
result
,
arg1
,
arg2
)(
[](
auto
x
,
auto
y
)
__device__
{
return
1.
f
/
(
1.
f
+
::
exp
(
to_hip_type
(
-
(
x
+
y
))));
});
return
a
.
apply
([
&
](
auto
x
)
{
return
vec_reduce
(
x
,
op
);
});
}
void
add_sigmoid
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg1
,
const
argument
&
arg2
,
const
argument
&
arg3
)
template
<
index_int
Axis
,
class
F
,
class
BinOp
,
class
Output
,
class
Input1
,
class
Input2
,
class
...
Inputs
>
__device__
void
generic_binary_layernorm
(
F
compute
,
BinOp
op
,
float
eps
,
Output
output
,
Input1
input1
,
Input2
input2
,
Inputs
...
inputs
)
{
nary
(
stream
,
result
,
arg1
,
arg2
,
arg3
)([](
auto
x
,
auto
y
,
auto
z
)
__device__
{
return
1.
f
/
(
1.
f
+
::
exp
(
to_hip_type
(
-
(
x
+
y
+
z
))));
using
reduce_output
=
reduce
::
with_axis
<
Input1
,
Axis
>
;
reduce
::
block
::
run
<
reduce_output
>
([
&
](
auto
,
auto
r
)
{
using
value_type
=
typename
Input1
::
type
;
constexpr
auto
relements
=
r
.
template
elements
<
Input1
>();
auto
means
=
r
.
reduce
(
op
::
sum
{},
make_array
<
vec_type
<
value_type
>>
(
0
,
0
),
[
&
](
auto
x1
,
auto
x2
)
{
auto
x
=
op
(
x1
,
x2
);
return
make_array
(
x
,
x
*
x
)
*
vec_type
<
value_type
>
{
1.0
/
relements
};
})(
input1
,
input2
);
auto
mean_x
=
means
[
0
];
auto
mean_x2
=
means
[
1
];
auto
variance
=
mean_x2
-
(
mean_x
*
mean_x
);
value_type
eps_val
=
eps
;
// implicit conversion for eps
r
.
inner
([
&
](
auto
&
y
,
auto
x1
,
auto
x2
,
auto
...
xs
)
{
auto
x
=
op
(
x1
,
x2
);
auto
m
=
x
-
mean_x
;
// m * rsqrt(mean(m ^ 2) + epsilon)
y
=
compute
(
m
*
rsqrt
(
variance
+
eps_val
),
xs
...);
})(
output
,
input1
,
input2
,
inputs
...);
});
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
template
<
index_int
Axis
,
class
F
,
class
Output
,
class
Input
,
class
...
Inputs
>
__device__
void
layernorm
(
F
compute
,
float
eps
,
Output
output
,
Input
input
,
Inputs
...
inputs
)
{
generic_binary_layernorm
<
Axis
>
(
compute
,
[](
auto
x
,
auto
)
{
return
x
;
},
eps
,
output
,
input
,
input
,
inputs
...);
}
template
<
index_int
Axis
,
class
F
,
class
Output
,
class
Input1
,
class
Input2
,
class
...
Inputs
>
__device__
void
add_layernorm
(
F
compute
,
float
eps
,
Output
output
,
Input1
input1
,
Input2
input2
,
Inputs
...
inputs
)
{
generic_binary_layernorm
<
Axis
>
(
compute
,
[](
auto
x1
,
auto
x2
)
{
return
x1
+
x2
;
},
eps
,
output
,
input1
,
input2
,
inputs
...);
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
View file @
3a4d36cf
...
...
@@ -104,6 +104,7 @@ MIGRAPHX_DEVICE_MATH(floor, ::floor)
MIGRAPHX_DEVICE_MATH
(
isnan
,
::
isnan
)
MIGRAPHX_DEVICE_MATH
(
log
,
::
log
)
MIGRAPHX_DEVICE_MATH
(
pow
,
::
pow
)
MIGRAPHX_DEVICE_MATH
(
remainder
,
::
remainder
)
MIGRAPHX_DEVICE_MATH
(
round
,
::
round
)
MIGRAPHX_DEVICE_MATH
(
rsqrt
,
::
rsqrt
)
MIGRAPHX_DEVICE_MATH
(
sin
,
::
sin
)
...
...
@@ -111,6 +112,7 @@ MIGRAPHX_DEVICE_MATH(sinh, ::sinh)
MIGRAPHX_DEVICE_MATH
(
sqrt
,
::
sqrt
)
MIGRAPHX_DEVICE_MATH
(
tan
,
::
tan
)
MIGRAPHX_DEVICE_MATH
(
tanh
,
::
tanh
)
MIGRAPHX_DEVICE_MATH
(
fmod
,
::
fmod
)
// Float overloads
MIGRAPHX_DEVICE_MATH_FOR
(
float
,
acos
,
::
acosf
)
...
...
@@ -126,6 +128,7 @@ MIGRAPHX_DEVICE_MATH_FOR(float, sin, ::sinf)
MIGRAPHX_DEVICE_MATH_FOR
(
float
,
sinh
,
::
sinhf
)
MIGRAPHX_DEVICE_MATH_FOR
(
float
,
tan
,
::
tanf
)
MIGRAPHX_DEVICE_MATH_FOR
(
float
,
tanh
,
::
tanhf
)
MIGRAPHX_DEVICE_MATH_FOR
(
float
,
fmod
,
::
fmodf
)
// Builtin half functions
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
abs
,
::
__habs
)
...
...
@@ -148,11 +151,13 @@ MIGRAPHX_DEVICE_MATH_HALF(erf, ::erf)
MIGRAPHX_DEVICE_MATH_HALF
(
floor
,
::
floor
)
MIGRAPHX_DEVICE_MATH_HALF
(
isnan
,
::
isnan
)
MIGRAPHX_DEVICE_MATH_HALF
(
pow
,
::
pow
)
MIGRAPHX_DEVICE_MATH_HALF
(
remainder
,
::
remainder
)
MIGRAPHX_DEVICE_MATH_HALF
(
round
,
::
round
)
MIGRAPHX_DEVICE_MATH_HALF
(
sin
,
::
sin
)
MIGRAPHX_DEVICE_MATH_HALF
(
sinh
,
::
sinh
)
MIGRAPHX_DEVICE_MATH_HALF
(
tan
,
::
tan
)
MIGRAPHX_DEVICE_MATH_HALF
(
tanh
,
::
tanh
)
MIGRAPHX_DEVICE_MATH_HALF
(
fmod
,
::
fmod
)
// Map math functions to hip half2 functions
// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
...
...
@@ -226,11 +231,13 @@ MIGRAPHX_DEVICE_MATH_VEC(cosh)
MIGRAPHX_DEVICE_MATH_VEC
(
erf
)
MIGRAPHX_DEVICE_MATH_VEC
(
exp
)
MIGRAPHX_DEVICE_MATH_VEC
(
floor
)
MIGRAPHX_DEVICE_MATH_VEC
(
fmod
)
MIGRAPHX_DEVICE_MATH_VEC
(
isnan
)
MIGRAPHX_DEVICE_MATH_VEC
(
log
)
MIGRAPHX_DEVICE_MATH_VEC
(
max
)
MIGRAPHX_DEVICE_MATH_VEC
(
min
)
MIGRAPHX_DEVICE_MATH_VEC
(
pow
)
MIGRAPHX_DEVICE_MATH_VEC
(
remainder
)
MIGRAPHX_DEVICE_MATH_VEC
(
round
)
MIGRAPHX_DEVICE_MATH_VEC
(
rsqrt
)
MIGRAPHX_DEVICE_MATH_VEC
(
sin
)
...
...
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
View file @
3a4d36cf
...
...
@@ -90,7 +90,7 @@ struct lowest
template
<
class
T
>
constexpr
operator
T
()
const
{
return
numeric_lowest
<
T
>
();
return
numeric_lowest
<
vec_type
<
T
>
>
();
}
};
...
...
@@ -99,7 +99,7 @@ struct highest
template
<
class
T
>
constexpr
operator
T
()
const
{
return
numeric_max
<
T
>
();
return
numeric_max
<
vec_type
<
T
>
>
();
}
};
}
// namespace migraphx
...
...
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
View file @
3a4d36cf
...
...
@@ -94,16 +94,17 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE
(
op
::
min
,
v_min
)
MIGRAPHX_DPP_REDUCE
(
op
::
product
,
v_mul
)
template
<
class
Op
,
class
T
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
i
ndex
_int
n
,
F
f
)
template
<
class
Op
,
class
T
,
class
Index
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
I
ndex
n
,
F
f
)
{
MIGRAPHX_ASSERT
(
idx
.
max_nlocal
()
==
idx
.
nlocal
());
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr
index_int
lanes_per_thread
=
16
;
#else
constexpr
index_int
lanes_per_thread
=
64
;
#endif
using
type
=
decltype
(
f
(
0
));
__shared__
type
buffer
[
idx
.
nlocal
()
/
lanes_per_thread
];
__shared__
type
buffer
[
idx
.
max_
nlocal
()
/
lanes_per_thread
];
type
x
=
init
;
idx
.
local_stride
(
n
,
[
&
](
auto
i
)
{
x
=
op
(
x
,
f
(
i
));
});
dpp_reduce
(
x
,
op
);
...
...
@@ -123,12 +124,12 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
return
y
;
}
#else
template
<
class
Op
,
class
T
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
i
ndex
_int
n
,
F
f
)
template
<
class
Op
,
class
T
,
class
Index
,
class
F
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
I
ndex
n
,
F
f
)
{
MIGRAPHX_ASSERT
(
idx
.
max_nlocal
()
==
idx
.
nlocal
());
using
type
=
decltype
(
f
(
0
));
__shared__
type
buffer
[
idx
.
nlocal
()];
__shared__
type
buffer
[
idx
.
max_
nlocal
()];
type
x
=
init
;
idx
.
local_stride
(
n
,
[
&
](
auto
i
)
{
x
=
op
(
x
,
f
(
i
));
});
buffer
[
idx
.
local
]
=
x
;
...
...
@@ -201,12 +202,9 @@ struct block
__device__
auto
reduce
(
Op
op
,
T
init
,
Read
read
)
const
{
return
sliced
(
slicer
,
[
=
](
auto
x
,
auto
...
xs
)
{
return
vec_reduce
(
block_reduce
(
idx
,
op
,
init
,
x
.
get_shape
().
elements
(),
[
&
](
auto
j
)
{
return
read
(
x
[
j
],
xs
[
j
]...);
}),
op
);
return
block_reduce
(
idx
,
op
,
init
,
x
.
get_shape
().
elements
(),
[
&
](
auto
j
)
{
return
vec_reduce
(
read
(
x
[
j
],
xs
[
j
]...),
op
);
});
});
}
...
...
@@ -224,6 +222,18 @@ struct block
idx
.
local_stride
(
x
.
get_shape
().
elements
(),
[
&
](
auto
j
)
{
f
(
x
[
j
],
xs
[
j
]...);
});
});
}
template
<
class
Input
>
constexpr
auto
elements
()
const
{
using
reduce_type
=
decltype
(
slicer
(
Input
{}));
using
value_type
=
typename
Input
::
type
;
constexpr
auto
relements
=
get_shape_c
<
reduce_type
>
{}.
elements
();
if
constexpr
(
vec_size
<
value_type
>
()
>
1
)
return
relements
*
vec_size
<
value_type
>
();
else
return
relements
;
}
};
template
<
class
Slicer
>
...
...
@@ -281,6 +291,13 @@ struct lane
}
});
}
template
<
class
Input
>
constexpr
auto
elements
()
const
{
using
reduce_type
=
decltype
(
slicer
(
Input
{}));
return
get_shape_c
<
reduce_type
>
{}.
elements
();
}
};
template
<
class
Slicer
>
...
...
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
View file @
3a4d36cf
...
...
@@ -192,9 +192,13 @@ struct common_type<T, U, Us...>
template
<
class
...
Ts
>
using
common_type_t
=
typename
common_type
<
Ts
...
>::
type
;
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
constexpr
unsigned
long
int_max
(
unsigned
long
n
)
{
return
(
1u
<<
(
n
*
8
))
-
1
;
}
template
<
class
T
>
template
<
class
T
,
MIGRAPHX_REQUIRES
(
is_integral
<
T
>{}
or
is_floating_point
<
T
>
{}
or
is_same
<
T
,
migraphx
::
half
>
{})
>
constexpr
T
numeric_max
()
{
if
constexpr
(
is_integral
<
T
>
{})
...
...
@@ -230,8 +234,6 @@ constexpr T numeric_lowest()
}
}
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
}
// namespace migraphx
#endif
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
View file @
3a4d36cf
...
...
@@ -175,7 +175,7 @@ template <class T, class Op>
constexpr
auto
vec_reduce
(
T
x
,
Op
op
)
{
if
constexpr
(
vec_size
<
T
>
()
<
2
)
return
x
;
return
vec_type
<
T
>
{
x
}
;
else
{
vec_type
<
T
>
result
=
x
[
0
];
...
...
src/targets/gpu/lowering.cpp
View file @
3a4d36cf
...
...
@@ -27,42 +27,24 @@
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/op/abs.hpp>
#include <migraphx/op/batch_norm_inference.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/deconvolution.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/if_op.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/reshape.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/greater.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/leaky_relu.hpp>
#include <migraphx/gpu/less.hpp>
#include <migraphx/gpu/logical_and.hpp>
#include <migraphx/gpu/logical_or.hpp>
#include <migraphx/gpu/logical_xor.hpp>
#include <migraphx/gpu/lrn.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
...
...
@@ -99,78 +81,21 @@ struct miopen_apply
(
void
)
i
;
}
const
std
::
unordered_set
<
std
::
string
>&
get_rocblas_fp32_archs
()
{
static
std
::
unordered_set
<
std
::
string
>
supported_archs
{
"gfx908"
,
"gfx90a"
};
return
supported_archs
;
}
void
init
()
{
assert
(
mod
!=
nullptr
);
assert
(
pass
!=
nullptr
);
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
auto
&
ctx
=
get_context
();
const
auto
device_name
=
trim
(
split_string
(
get_device_name
(),
':'
).
front
());
if
(
contains
(
get_rocblas_fp32_archs
(),
device_name
))
compute_fp32
=
true
;
rocblas_gemm_flags
flag
;
rocblas_query_int8_layout_flag
(
ctx
.
get_stream
().
get_rocblas
(),
&
flag
);
int8_x4_format
=
(
flag
==
rocblas_gemm_flags_pack_int8x4
);
#endif
auto
&
ctx
=
get_context
();
int8_x4_format
=
get_int8_x4_format
(
ctx
);
compute_fp32
=
get_compute_fp32_flag
();
offload_copy
=
(
mod
->
name
()
==
"main"
)
?
pass
->
offload_copy
:
false
;
add_generic_op
(
"acos"
);
add_generic_op
(
"acosh"
);
add_generic_op
(
"add"
);
add_generic_op
(
"asin"
);
add_generic_op
(
"asinh"
);
add_generic_op
(
"atan"
);
add_generic_op
(
"atanh"
);
add_generic_op
(
"ceil"
);
add_generic_op
(
"contiguous"
);
add_generic_op
(
"cos"
);
add_generic_op
(
"cosh"
);
add_generic_op
(
"div"
);
add_generic_op
(
"equal"
);
add_generic_op
(
"erf"
);
add_generic_op
(
"exp"
);
add_generic_op
(
"floor"
);
add_generic_op
(
"greater"
);
add_generic_op
(
"less"
);
add_generic_op
(
"log"
);
add_generic_op
(
"logical_and"
);
add_generic_op
(
"logical_or"
);
add_generic_op
(
"logical_xor"
);
add_generic_op
(
"max"
);
add_generic_op
(
"min"
);
add_generic_op
(
"mul"
);
add_generic_op
(
"not"
);
add_generic_op
(
"pow"
);
add_generic_op
(
"prelu"
);
add_generic_op
(
"recip"
);
add_generic_op
(
"relu"
);
add_generic_op
(
"round"
);
add_generic_op
(
"rsqrt"
);
add_generic_op
(
"sigmoid"
);
add_generic_op
(
"sign"
);
add_generic_op
(
"sin"
);
add_generic_op
(
"sinh"
);
add_generic_op
(
"sqdiff"
);
add_generic_op
(
"sqrt"
);
add_generic_op
(
"sub"
);
add_generic_op
(
"tan"
);
add_generic_op
(
"tanh"
);
add_generic_op
(
"where"
);
add_extend_op
(
"abs"
);
add_extend_op
(
"argmax"
);
add_extend_op
(
"argmin"
);
add_extend_op
(
"clip"
);
add_extend_op
(
"concat"
);
add_extend_op
(
"convert"
);
add_extend_op
(
"elu"
);
add_extend_op
(
"gather"
);
add_extend_op
(
"leaky_relu"
);
...
...
@@ -341,7 +266,7 @@ struct miopen_apply
catch
(
migraphx
::
exception
&
)
{
// In case no solver supports the default format, retry using the other format.
compile_quant_conv_with_format
(
!
int8_x4_format
);
compile_quant_conv_with_format
(
not
int8_x4_format
);
}
auto
args
=
ins
->
inputs
();
...
...
src/targets/gpu/mlir.cpp
View file @
3a4d36cf
...
...
@@ -78,7 +78,7 @@ struct mlir_handle
friend
bool
operator
==
(
ptr
x
,
ptr
y
)
{
return
x
.
get_value
()
==
y
.
get_value
();
}
friend
bool
operator
!=
(
ptr
x
,
ptr
y
)
{
return
!
(
x
==
y
);
}
friend
bool
operator
!=
(
ptr
x
,
ptr
y
)
{
return
not
(
x
==
y
);
}
T
obj
{};
};
...
...
@@ -503,7 +503,7 @@ struct mlir_program
pp
=
problem_params
{
ins
->
get_operator
(),
to_shapes
(
ins
->
inputs
()),
ins
->
get_shape
()};
std
::
string
tuned
=
get_tune_params
();
if
(
!
tuned
.
empty
())
if
(
not
tuned
.
empty
())
ops
.
add_attributes
({{
"perf_config"
,
tuned
}});
// check if HW supports xdlops
if
(
contains
(
get_xdlops_archs
(),
target_name
))
...
...
src/targets/gpu/pack_int8_args.cpp
View file @
3a4d36cf
...
...
@@ -154,7 +154,7 @@ void pack_int8_args::apply(module& m) const
bool
transa
=
inputs
[
0
]
->
get_shape
().
transposed
();
bool
transb
=
inputs
[
1
]
->
get_shape
().
transposed
();
if
(
!
transb
)
if
(
not
transb
)
{
auto
packed_b
=
m
.
insert_instruction
(
ins
,
make_op
(
"hip::allocate"
,
{{
"shape"
,
to_value
(
inputs
[
1
]
->
get_shape
())}}));
...
...
src/targets/gpu/prefuse_ops.cpp
View file @
3a4d36cf
...
...
@@ -23,13 +23,62 @@
*/
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_op.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
{
template
<
class
Derived
,
std
::
size_t
N
>
struct
layernorm_base
{
float
epsilon
=
1e-12
f
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
{
return
pack
(
f
(
self
.
epsilon
,
"epsilon"
));
}
shape
compute_shape
(
std
::
vector
<
shape
>
inputs
,
std
::
vector
<
module_ref
>
mods
)
const
{
std
::
size_t
nargs
=
1
;
if
(
not
mods
.
empty
())
{
auto
*
pm
=
mods
.
front
();
nargs
=
pm
->
get_parameter_names
().
size
();
}
check_shapes
{
inputs
,
static_cast
<
const
Derived
&>
(
*
this
)}.
has
(
nargs
+
N
);
auto
s
=
inputs
.
at
(
0
);
if
(
s
.
scalar
())
{
return
s
;
}
else
if
(
s
.
broadcasted
())
{
return
{
s
.
type
(),
s
.
lens
()};
}
else
{
return
s
.
with_lens
(
s
.
lens
());
}
}
};
struct
layernorm
:
layernorm_base
<
layernorm
,
0
>
{
std
::
string
name
()
const
{
return
"gpu::prelayernorm"
;
}
};
MIGRAPHX_REGISTER_OP
(
layernorm
);
struct
add_layernorm
:
layernorm_base
<
add_layernorm
,
1
>
{
std
::
string
name
()
const
{
return
"gpu::preadd_layernorm"
;
}
};
MIGRAPHX_REGISTER_OP
(
add_layernorm
);
struct
find_layernorm
{
auto
matcher
()
const
{
return
match
::
layernorm
();
}
...
...
@@ -38,60 +87,33 @@ struct find_layernorm
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"x"
];
auto
eps
=
r
.
instructions
[
"eps"
]
->
eval
().
at
<
float
>
();
if
(
not
x_ins
->
get_shape
().
standard
())
x_ins
=
m
.
insert_instruction
(
ins
,
make_op
(
"contiguous"
),
x_ins
);
auto
relements
=
x_ins
->
get_shape
().
lens
().
back
();
if
(
relements
>
1024
or
(
relements
%
4
!=
0
and
relements
>
256
))
return
;
auto
a
=
m
.
insert_instruction
(
ins
,
make_op
(
"hip::allocate"
,
{{
"shape"
,
to_value
(
x_ins
->
get_shape
())}}));
m
.
replace_instruction
(
ins
,
make_op
(
"gpu::layernorm"
),
x_ins
,
a
);
m
.
replace_instruction
(
ins
,
layernorm
{
eps
},
x_ins
);
}
};
struct
find_
tri
addlayernorm
struct
find_add
_
layernorm
{
auto
matcher
()
const
{
auto
add1
=
match
::
name
(
"add"
)(
match
::
none_of
(
match
::
is_constant
()),
match
::
args
(
match
::
any
().
bind
(
"z1"
),
match
::
any
().
bind
(
"z2"
)));
auto
add2
=
match
::
name
(
"add"
)(
match
::
either_arg
(
0
,
1
)(
add1
,
match
::
any
().
bind
(
"z3"
)));
return
match
::
layernorm
()(
match
::
var
(
"x"
)(
add2
));
return
match
::
layernorm
()(
match
::
var
(
"x"
)(
match
::
name
(
"add"
).
bind
(
"add"
)));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"z1"
];
auto
y_ins
=
r
.
instructions
[
"z2"
];
auto
z_ins
=
r
.
instructions
[
"z3"
];
for
(
auto
*
pins
:
{
&
x_ins
,
&
y_ins
,
&
z_ins
})
{
if
(
not
(
*
pins
)
->
get_shape
().
standard
())
*
pins
=
m
.
insert_instruction
(
ins
,
make_op
(
"contiguous"
),
*
pins
);
}
auto
relements
=
x_ins
->
get_shape
().
lens
().
back
();
if
(
relements
>
1024
or
(
relements
%
4
!=
0
and
relements
>
256
))
return
;
auto
ins
=
r
.
result
;
auto
add_ins
=
r
.
instructions
[
"add"
];
auto
eps
=
r
.
instructions
[
"eps"
]
->
eval
().
at
<
float
>
();
auto
a
=
m
.
insert_instruction
(
ins
,
make_op
(
"hip::allocate"
,
{{
"shape"
,
to_value
(
x_ins
->
get_shape
())}}));
m
.
replace_instruction
(
ins
,
make_op
(
"gpu::triadd_layernorm"
),
x_ins
,
y_ins
,
z_ins
,
a
);
m
.
replace_instruction
(
ins
,
add_layernorm
{
eps
},
add_ins
->
inputs
());
}
};
}
// namespace
void
prefuse_ops
::
apply
(
module
&
m
)
const
{
match
::
find_matches
(
m
,
find_
tri
addlayernorm
{},
find_layernorm
{});
match
::
find_matches
(
m
,
find_add
_
layernorm
{},
find_layernorm
{});
}
}
// namespace gpu
...
...
src/targets/gpu/quant_convolution.cpp
View file @
3a4d36cf
...
...
@@ -22,7 +22,6 @@
* THE SOFTWARE.
*/
#include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/device/convert.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
...
...
src/targets/gpu/rocblas.cpp
View file @
3a4d36cf
...
...
@@ -21,7 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <unordered_set>
#include <migraphx/ranges.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/context.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -41,6 +47,33 @@ rocblas_handle_ptr create_rocblas_handle_ptr(hipStream_t s)
return
rb
;
}
const
std
::
unordered_set
<
std
::
string
>&
get_rocblas_fp32_archs
()
{
static
std
::
unordered_set
<
std
::
string
>
supported_archs
{
"gfx908"
,
"gfx90a"
};
return
supported_archs
;
}
bool
get_compute_fp32_flag
()
{
bool
compute_fp32
=
false
;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
const
auto
device_name
=
trim
(
split_string
(
get_device_name
(),
':'
).
front
());
if
(
contains
(
get_rocblas_fp32_archs
(),
device_name
))
compute_fp32
=
true
;
#endif
return
compute_fp32
;
}
bool
get_int8_x4_format
(
context
&
ctx
)
{
bool
int8_x4_format
=
true
;
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
rocblas_gemm_flags
flag
;
rocblas_query_int8_layout_flag
(
ctx
.
get_stream
().
get_rocblas
(),
&
flag
);
int8_x4_format
=
(
flag
==
rocblas_gemm_flags_pack_int8x4
);
#endif
return
int8_x4_format
;
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/softmax.cpp
deleted
100644 → 0
View file @
6bec381f
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/softmax.hpp>
#include <migraphx/gpu/device/softmax.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
shape
hip_softmax
::
compute_shape
(
const
std
::
vector
<
shape
>&
inputs
)
const
{
check_shapes
{
inputs
,
*
this
}.
has
(
2
).
standard
();
return
op
.
normalize_compute_shape
({
inputs
.
at
(
0
)});
}
argument
hip_softmax
::
compute
(
context
&
ctx
,
const
shape
&
,
const
std
::
vector
<
argument
>&
args
)
const
{
auto
n_dim
=
args
.
front
().
get_shape
().
lens
().
size
();
auto
tuned_axis
=
tune_axis
(
n_dim
,
op
.
axis
,
op
.
name
());
device
::
softmax
(
ctx
.
get_stream
().
get
(),
args
.
back
(),
args
.
front
(),
tuned_axis
);
return
args
.
back
();
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/target.cpp
View file @
3a4d36cf
...
...
@@ -42,6 +42,7 @@
#include <migraphx/register_target.hpp>
#include <migraphx/replace_allocate.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_gelu.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
...
...
@@ -116,6 +117,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
inline_module
{},
rewrite_pooling
{},
dead_code_elimination
{},
rewrite_gelu
{},
dead_code_elimination
{},
eliminate_common_subexpression
{},
dead_code_elimination
{},
simplify_algebra
{},
...
...
@@ -134,8 +137,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
lowering
{
&
ctx
,
options
.
offload_copy
},
eliminate_contiguous
{
"gpu::contiguous"
},
dead_code_elimination
{},
replace_allocate
{
gpu_allocation_model
{},
options
.
offload_copy
},
dead_code_elimination
{},
eliminate_concat
{
concat_gpu_optimization
{}},
dead_code_elimination
{},
pack_int8_args
{},
...
...
@@ -144,6 +145,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination
{},
fuse_ops
{
&
ctx
,
options
.
fast_math
},
dead_code_elimination
{},
replace_allocate
{
gpu_allocation_model
{},
options
.
offload_copy
},
dead_code_elimination
{},
compile_ops
{
&
ctx
},
dead_code_elimination
{},
write_literals
{
&
ctx
},
...
...
Prev
1
…
12
13
14
15
16
17
18
19
20
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment