Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
7e297b13
Commit
7e297b13
authored
Jun 13, 2022
by
Paul
Browse files
Merge
parents
86ea5e91
aa7ff911
Changes
765
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
703 additions
and
150 deletions
+703
-150
src/targets/gpu/compile_pointwise.cpp
src/targets/gpu/compile_pointwise.cpp
+0
-62
src/targets/gpu/compiler.cpp
src/targets/gpu/compiler.cpp
+39
-0
src/targets/gpu/device/fill.cpp
src/targets/gpu/device/fill.cpp
+17
-0
src/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp
...ts/gpu/device/include/migraphx/gpu/device/float_equal.hpp
+41
-0
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
...targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+3
-2
src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
...ts/gpu/device/include/migraphx/gpu/device/multi_index.hpp
+4
-3
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
...targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+15
-9
src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
+13
-6
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
+15
-0
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp
+12
-11
src/targets/gpu/device/layernorm.cpp
src/targets/gpu/device/layernorm.cpp
+15
-4
src/targets/gpu/device/multinomial.cpp
src/targets/gpu/device/multinomial.cpp
+66
-0
src/targets/gpu/device/nonzero.cpp
src/targets/gpu/device/nonzero.cpp
+54
-0
src/targets/gpu/device/prefix_scan_sum.cpp
src/targets/gpu/device/prefix_scan_sum.cpp
+100
-20
src/targets/gpu/device/softmax.cpp
src/targets/gpu/device/softmax.cpp
+48
-24
src/targets/gpu/device/topk.cpp
src/targets/gpu/device/topk.cpp
+216
-0
src/targets/gpu/device/where.cpp
src/targets/gpu/device/where.cpp
+39
-0
src/targets/gpu/driver/CMakeLists.txt
src/targets/gpu/driver/CMakeLists.txt
+2
-6
src/targets/gpu/driver/compile_op.cpp
src/targets/gpu/driver/compile_op.cpp
+3
-3
src/targets/gpu/driver/main.cpp
src/targets/gpu/driver/main.cpp
+1
-0
No files found.
src/targets/gpu/compile_pointwise.cpp
deleted
100755 → 0
View file @
86ea5e91
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
static
const
char
*
const
pointwise_kernel
=
R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
int main() {}
)__migraphx__"
;
// Builds a comma separated list "<param>0,<param>1,...,<param>(count-1)".
// `param` is the text placed in front of every index; `count` is the number
// of entries to generate.
std::string enum_params(std::size_t count, std::string param)
{
    std::vector<std::string> names;
    names.reserve(count);
    for(std::size_t i = 0; i < count; ++i)
    {
        names.push_back(param + std::to_string(i));
    }
    return join_strings(names, ",");
}
// Computes the global work size for `n` elements with work-groups of `local`
// items: the number of groups needed to cover `n` (rounded up), capped at 256
// groups, multiplied by `local`.
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
    constexpr std::size_t max_groups = 256;
    // Ceiling division: groups required so that groups * local >= n.
    const std::size_t needed = (n + local - 1) / local;
    const std::size_t used   = needed < max_groups ? needed : max_groups;
    return used * local;
}
// Generates and compiles a pointwise kernel for the given shapes.
//   inputs - shapes of all kernel arguments; the last entry is used as the
//            output shape and the first one sizes the launch.
//   lambda - source text substituted for ${lambda} in pointwise_kernel.
// The context argument is unused. Returns the operation produced by
// compile_hip_code_object.
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
    hip_compile_options opts;
    opts.global         = compute_global(inputs.front().elements());
    opts.local          = 1024;
    opts.inputs         = inputs;
    opts.output         = inputs.back();
    opts.reduced_inputs = reduce_dims(inputs);

    // One "void * private_pN" parameter and one "private_pN" argument per shape.
    const auto param_list = enum_params(inputs.size(), "void * private_p");
    const auto arg_list   = enum_params(inputs.size(), "private_p");

    auto kernel_src = interpolate_string(
        pointwise_kernel, {{"params", param_list}, {"args", arg_list}, {"lambda", lambda}});
    return compile_hip_code_object(kernel_src, opts);
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/compiler.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
auto
&
compiler_map
()
{
static
std
::
unordered_map
<
std
::
string
,
compiler_compile
>
m
;
// NOLINT
return
m
;
}
auto
&
compiler_op_map
()
{
static
std
::
unordered_map
<
std
::
string
,
compiler_compile_op
>
m
;
// NOLINT
return
m
;
}
void
register_compiler
(
const
std
::
string
&
name
,
compiler_compile
c
,
compiler_compile_op
cop
)
{
compiler_map
()[
name
]
=
std
::
move
(
c
);
compiler_op_map
()[
name
]
=
std
::
move
(
cop
);
}
bool
has_compiler_for
(
const
std
::
string
&
name
)
{
return
compiler_map
().
count
(
name
)
>
0
;
}
// Looks up the compiler registered for op.name() and invokes it with
// (ctx, ins, op). The lookup uses unordered_map::at, so an unregistered name
// throws std::out_of_range.
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
    const auto& compile_fn = compiler_map().at(op.name());
    return compile_fn(ctx, ins, op);
}
// Looks up the op-compiler registered under `name` and invokes it with
// (ctx, inputs, v). The lookup uses unordered_map::at, so an unregistered
// name throws std::out_of_range.
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
    const auto& compile_fn = compiler_op_map().at(name);
    return compile_fn(ctx, inputs, v);
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/fill.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/gpu/device/fill.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Launches nary over `result` on `stream` with a nullary device functor that
// always returns `val`.
void fill(hipStream_t stream, const argument& result, unsigned long val)
{
    auto constant = [=]() __device__ { return val; };
    nary(stream, result)(constant);
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/include/migraphx/gpu/device/float_equal.hpp
0 → 100644
View file @
7e297b13
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Shorthand for the member `type` of std::common_type<Ts...>.
template <class... Ts>
using common_type = std::common_type_t<Ts...>;
// Floating-point overload: true when both values are finite and y lies in the
// closed interval between the representable neighbours of x (nextafter toward
// lowest() and toward max()).
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    if(not std::isfinite(x) or not std::isfinite(y))
        return false;
    // `auto` keeps whatever type nextafter returns, so the comparisons are
    // carried out exactly as in a single combined expression.
    const auto below = std::nextafter(x, std::numeric_limits<T>::lowest());
    if(not(below <= y))
        return false;
    const auto above = std::nextafter(x, std::numeric_limits<T>::max());
    return above >= y;
}
// Overload for types that are not floating point: plain operator== comparison.
template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    const bool same = (x == y);
    return same;
}
// Compares two values of possibly different types by converting both to their
// common type and dispatching to float_equal_device.
template <class T, class U>
__device__ bool float_equal(T x, U y)
{
    using compare_type = common_type<T, U>;
    return float_equal_device<compare_type>(x, y);
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif
src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
100755 → 100644
View file @
7e297b13
...
...
@@ -75,8 +75,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto gs_invoke(F&& f, index_int i, index) -> decltype(
inline
auto
gs_launch
(
hipStream_t
stream
,
index_int
n
,
index_int
local
=
1024
)
{
index_int
groups
=
(
n
+
local
-
1
)
/
local
;
index_int
nglobal
=
std
::
min
<
index_int
>
(
256
,
groups
)
*
local
;
index_int
groups
=
(
n
+
local
-
1
)
/
local
;
// max possible number of blocks is set to 1B (1,073,741,824)
index_int
nglobal
=
std
::
min
<
index_int
>
(
1073741824
,
groups
)
*
local
;
return
[
=
](
auto
f
)
{
launch
(
stream
,
nglobal
,
local
)([
=
](
auto
idx
)
__device__
{
...
...
src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
View file @
7e297b13
...
...
@@ -57,9 +57,10 @@ inline auto mi_nglobal(const hip_shape<N>& s, index_int nlocal)
{
assert
(
s
.
standard
);
assert
(
s
.
elements
()
>
0
);
index_int
n
=
s
.
elements
();
index_int
groups
=
(
n
+
nlocal
-
1
)
/
nlocal
;
index_int
nglobal
=
std
::
min
<
index_int
>
(
128
,
groups
)
*
nlocal
;
index_int
n
=
s
.
elements
();
index_int
groups
=
(
n
+
nlocal
-
1
)
/
nlocal
;
// max possible number of blocks is set to 1B (1,073,741,824)
index_int
nglobal
=
std
::
min
<
index_int
>
(
1073741824
,
groups
)
*
nlocal
;
assert
(
groups
>
0
);
assert
(
nglobal
>
0
);
...
...
src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
100755 → 100644
View file @
7e297b13
...
...
@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
device
{
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_NO_DPP
#endif
#ifdef MIGRAPHX_NO_DPP
template
<
index_int
N
,
class
Op
,
...
...
@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
in
=
op
(
in
,
out
);
out
=
dpp_mov
<
dpp_row_shr
(
8
),
0xf
,
0xc
>
(
in
);
in
=
op
(
in
,
out
);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out
=
dpp_mov
<
dpp_row_bcast
(
15
),
0xa
>
(
in
);
in
=
op
(
in
,
out
);
out
=
dpp_mov
<
dpp_row_bcast
(
31
),
0xc
>
(
in
);
in
=
op
(
in
,
out
);
#endif
}
__device__
inline
void
dpp_reduce
(
float
&
x
,
sum
)
...
...
@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
"s_nop 1
\n
"
"v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc
\n
"
"s_nop 1
\n
"
#if __AMDGCN_WAVEFRONT_SIZE == 64
"v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa
\n
"
"s_nop 1
\n
"
"v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc
\n
"
#endif
"s_nop 1
\n
"
:
"=v"
(
x
)
:
"0"
(
x
));
...
...
@@ -135,21 +135,27 @@ template <index_int N,
MIGRAPHX_REQUIRES
(
not
std
::
is_integral
<
ForStride
>{})
>
__device__
auto
block_reduce
(
index
idx
,
Op
op
,
T
init
,
ForStride
fs
,
F
f
)
{
using
type
=
decltype
(
f
(
deduce_for_stride
(
fs
)));
MIGRAPHX_DEVICE_SHARED
type
buffer
[
N
/
64
];
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr
index_int
nthreads
=
16
;
#else
constexpr
index_int
nthreads
=
64
;
#endif
using
type
=
decltype
(
f
(
deduce_for_stride
(
fs
)));
MIGRAPHX_DEVICE_SHARED
type
buffer
[
N
/
nthreads
];
type
x
=
init
;
fs
([
&
](
auto
i
)
{
x
=
op
(
x
,
f
(
i
));
});
dpp_reduce
(
x
,
op
);
const
auto
ldsidx
=
idx
.
local
/
64
;
if
((
idx
.
local
%
64
)
==
63
)
const
auto
ldsidx
=
idx
.
local
/
nthreads
;
if
((
idx
.
local
%
nthreads
)
==
nthreads
-
1
)
{
buffer
[
ldsidx
]
=
x
;
}
__syncthreads
();
type
y
=
init
;
for
(
index_int
i
=
0
;
i
<
idx
.
nlocal
()
/
64
;
i
++
)
for
(
index_int
i
=
0
;
i
<
idx
.
nlocal
()
/
nthreads
;
i
++
)
{
y
=
op
(
y
,
buffer
[
i
]);
}
...
...
src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
View file @
7e297b13
...
...
@@ -44,12 +44,19 @@ __device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input,
template
<
index_int
N
,
class
Op
,
class
T
,
class
Input
,
class
Output
>
__device__
void
block_scan
(
index
idx
,
Op
op
,
T
init
,
index_int
n
,
Input
input
,
Output
output
)
{
block_scan
<
N
>
(
idx
,
op
,
init
,
[
&
](
auto
f
)
->
decltype
(
f
(
index_int
{}))
{
return
idx
.
local_stride
(
n
,
f
);
},
input
,
output
);
block_scan
<
N
>
(
idx
,
op
,
init
,
[
&
](
auto
f
)
->
decltype
(
f
(
index_int
{}))
{
return
idx
.
local_stride
(
n
,
f
);
},
input
,
output
);
}
// Wraps `f` so that it is called with the mirrored position n - i - 1 instead
// of i; any further arguments are passed through unchanged.
template <class F>
constexpr auto reverse_scan(index_int n, F f)
{
    return [n, f](auto pos, auto&&... rest) { return f(n - pos - 1, rest...); };
}
}
// namespace device
...
...
src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
View file @
7e297b13
...
...
@@ -129,6 +129,21 @@ __device__ __host__ T to_hip_type(T x)
// HIP doesn't support __fp16, so a gpu_half value is returned as float.
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
// Declares a trait named `trait` in the current namespace that inherits from
// std::trait<X> for every X, plus a full specialization for `T` that derives
// from std::true_type.
#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
template <class X> \
struct trait : std::trait<X> \
{ \
}; \
\
template <> \
struct trait<T> : std::true_type \
{ \
};
// With these expansions is_floating_point, is_signed and is_arithmetic all
// report true for __fp16.
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp
View file @
7e297b13
...
...
@@ -14,28 +14,23 @@ constexpr void visit_tensor_size(index_int n, F f)
{
switch
(
n
)
{
case
1
:
{
case
1
:
{
f
(
std
::
integral_constant
<
index_int
,
1
>
{});
break
;
}
case
2
:
{
case
2
:
{
f
(
std
::
integral_constant
<
index_int
,
2
>
{});
break
;
}
case
3
:
{
case
3
:
{
f
(
std
::
integral_constant
<
index_int
,
3
>
{});
break
;
}
case
4
:
{
case
4
:
{
f
(
std
::
integral_constant
<
index_int
,
4
>
{});
break
;
}
case
5
:
{
case
5
:
{
f
(
std
::
integral_constant
<
index_int
,
5
>
{});
break
;
}
...
...
@@ -181,7 +176,13 @@ template <index_int N, class T, class... Ts>
auto
hip_vec_visit_all
(
T
&&
x
,
Ts
&&
...
xs
)
{
return
[
&
](
auto
f
)
{
hip_visit_all_impl
(
get_shape
(
x
),
auto
sx
=
get_shape
(
x
);
auto
lens
=
sx
.
lens
();
assert
(
lens
.
back
()
%
N
==
0
);
assert
(
sx
.
strides
().
back
()
==
1
);
lens
.
back
()
/=
N
;
shape
vec_sx
{
sx
.
type
(),
lens
};
hip_visit_all_impl
(
vec_sx
,
make_hip_convert
([](
auto
*
p
)
{
return
as_vec
<
N
>
(
device_cast
(
p
));
}),
f
,
x
,
...
...
src/targets/gpu/device/layernorm.cpp
100755 → 100644
View file @
7e297b13
...
...
@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
device
{
#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
#if __AMDGCN_WAVEFRONT_SIZE == 32
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
#else
#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
#endif
#endif
template
<
class
T
>
struct
vector_type
{
...
...
@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
const
bool
in_range
=
idx
.
local
<
relements_v
;
auto
mean
=
[
&
](
auto
z
)
{
return
auto_block_reduce
<
MaxBlockSize
>
(
idx
,
sum
{},
value_type
(
0
),
relements_v
,
[
=
](
auto
)
{
return
z
;
})
/
value_type
(
relements
);
auto
m
=
auto_block_reduce
<
MaxBlockSize
>
(
idx
,
sum
{},
value_type
(
0
),
relements_v
,
[
=
](
auto
)
{
return
z
;
})
/
value_type
(
relements
);
#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
__builtin_amdgcn_s_barrier
();
#endif
return
m
;
};
// m = x - mean(x)
...
...
src/targets/gpu/device/multinomial.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Binary search over the sorted range [first, last).
// Returns an iterator to the first element `e` for which `value < e` holds,
// or `last` when no such element exists. Only operator< (value on the left)
// is required of the element type.
//
// Every local is initialized where it is declared: a C++17 constexpr function
// body may not define an uninitialized variable, so declaring them up front
// would make the function unusable in constant expressions.
template <class Iterator, class T>
constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
{
    using difference_type = typename std::iterator_traits<Iterator>::difference_type;

    difference_type count = std::distance(first, last);
    while(count > 0)
    {
        const difference_type step = count / 2;
        Iterator it                = first;
        std::advance(it, step);
        if(value < *it)
        {
            // The answer is in the left half, `it` itself included.
            count = step;
        }
        else
        {
            // Everything up to and including `it` is <= value; skip past it.
            first = ++it;
            count -= step + 1;
        }
    }
    return first;
}
// Launches one work item per output element (batch_size * sample_size). Each
// item takes the class_size-long slice of arg0 selected by the first
// coordinate of its output index, scales arg1[i] by the last value of that
// slice, and stores the upper_bound position of the scaled value within the
// slice into result.
// NOTE(review): presumably arg0 holds a per-batch cumulative distribution and
// arg1 uniform random samples - confirm against the caller.
void multinomial(hipStream_t stream,
                 const argument& result,
                 const argument& arg0,
                 const argument& arg1)
{
    const size_t batch_size  = arg0.get_shape().lens().front();
    const size_t class_size  = arg0.get_shape().lens().back();
    const size_t sample_size = result.get_shape().lens().back();
    const size_t total       = batch_size * sample_size;

    hip_visit_all(arg0, arg1)([&](auto cdf, auto dist) {
        result.visit([&](auto out) {
            hip_visit_views(out)([&](auto output) {
                gs_launch(stream, total)([=](auto i) __device__ {
                    const auto row = output.get_shape().multi(i).front();
                    // Slice of `cdf` belonging to this row.
                    auto slice_begin = cdf.begin() + (row * class_size);
                    auto slice_end   = slice_begin + class_size;
                    // Scale the sample by the final entry of the slice.
                    auto found = upper_bound(
                        slice_begin, slice_end, dist[i] * *(std::prev(slice_end)));
                    output[i] = std::distance(slice_begin, found);
                });
            });
        });
    });
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/nonzero.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Writes the multi-dimensional indices of the non-zero elements of arg_data
// into result (accessed as int64_t) and returns result. Coordinate k of the
// m-th non-zero element is stored at result[k * elem_num + m]; every other
// output entry is set to 0.
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
    // A single prefix sum assigns the output slots, so the launch below uses
    // exactly one block of this size.
    const index_int block_size = 256;

    auto s            = arg_data.get_shape();
    auto elem_num     = s.elements();
    auto out_elem_num = result.get_shape().elements();

    hip_visit_all(arg_data, s)([&](auto input, auto si) {
        const auto* in_ptr = device_cast(input.data());
        auto* ptr          = result.cast<int64_t>();
        gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
            // Zero the whole output first.
            idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });

            auto is_zero = [&](auto j) { return float_equal(in_ptr[j], 0); };

            // The scan sums a 0/1 flag per element, so for a non-zero element
            // the running total x is its 1-based rank among the non-zeros.
            block_scan<block_size>(
                idx,
                sum{},
                0,
                elem_num,
                [&](auto j) { return is_zero(j) ? 0 : 1; },
                [&](auto j, auto x) {
                    if(is_zero(j))
                        return;

                    auto out_loc = x - 1;
                    auto coords  = si.multi(j);
                    for(size_t k = 0; k < coords.size(); ++k)
                    {
                        ptr[k * elem_num + out_loc] = coords[k];
                    }
                });
        });
    });

    return result;
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/prefix_scan_sum.cpp
View file @
7e297b13
#include <migraphx/gpu/device/prefix_scan_sum.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
#include <migraphx/gpu/device/reduce.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace
migraphx
{
...
...
@@ -8,29 +9,108 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
device
{
void
prefix_scan_sum
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
,
int32_t
axis
)
void
prefix_scan_sum
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg
,
int32_t
axis
,
bool
exclusive
,
bool
reverse
)
{
const
index_int
block_size
=
256
;
const
index_int
n
=
arg
.
get_shape
().
lens
()[
axis
];
auto
rlens
=
result
.
get_shape
().
lens
();
rlens
[
axis
]
=
1
;
const
index_int
max_block_size
=
256
;
const
index_int
n
=
arg
.
get_shape
().
lens
()[
axis
];
auto
rlens
=
result
.
get_shape
().
lens
();
rlens
[
axis
]
=
1
;
hip_visit_all
(
result
,
arg
,
result
.
get_shape
().
with_lens
(
rlens
))(
[
=
](
auto
output
,
auto
input
,
auto
rshape
)
{
gs_launch
(
stream
,
rshape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
ridx
=
rshape
.
multi
(
i
/
block_size
);
auto
compute_idx
=
[
&
](
auto
j
)
{
auto
k
=
ridx
;
k
[
axis
]
=
j
;
return
k
;
};
block_scan
<
block_size
>
(
idx
,
sum
{},
0
,
n
,
[
&
](
auto
j
)
{
return
input
[
compute_idx
(
j
)];
},
[
&
](
auto
j
,
auto
x
)
{
output
[
compute_idx
(
j
)]
=
x
;
});
});
const
index_int
block_size
=
compute_block_size
(
rshape
.
elements
(),
max_block_size
);
if
(
reverse
and
exclusive
)
{
gs_launch
(
stream
,
rshape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
ridx
=
rshape
.
multi
(
i
/
block_size
);
auto
compute_idx
=
[
&
](
auto
j
)
{
auto
k
=
ridx
;
k
[
axis
]
=
j
;
return
k
;
};
block_scan
<
max_block_size
>
(
idx
,
sum
{},
0
,
n
,
reverse_scan
(
n
,
[
&
](
auto
j
)
{
return
input
[
compute_idx
(
j
)];
}),
reverse_scan
(
n
,
[
&
](
auto
j
,
auto
x
)
{
if
(
j
==
n
-
1
)
output
[
compute_idx
(
j
)]
=
0
;
if
(
j
>
0
)
output
[
compute_idx
(
j
-
1
)]
=
x
;
}));
});
}
else
if
(
reverse
)
{
gs_launch
(
stream
,
rshape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
ridx
=
rshape
.
multi
(
i
/
block_size
);
auto
compute_idx
=
[
&
](
auto
j
)
{
auto
k
=
ridx
;
k
[
axis
]
=
j
;
return
k
;
};
block_scan
<
max_block_size
>
(
idx
,
sum
{},
0
,
n
,
reverse_scan
(
n
,
[
&
](
auto
j
)
{
return
input
[
compute_idx
(
j
)];
}),
reverse_scan
(
n
,
[
&
](
auto
j
,
auto
x
)
{
output
[
compute_idx
(
j
)]
=
x
;
}));
});
}
else
if
(
exclusive
)
{
gs_launch
(
stream
,
rshape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
ridx
=
rshape
.
multi
(
i
/
block_size
);
auto
compute_idx
=
[
&
](
auto
j
)
{
auto
k
=
ridx
;
k
[
axis
]
=
j
;
return
k
;
};
block_scan
<
max_block_size
>
(
idx
,
sum
{},
0
,
n
,
[
&
](
auto
j
)
{
return
input
[
compute_idx
(
j
)];
},
[
&
](
auto
j
,
auto
x
)
{
auto
k
=
j
+
1
;
if
(
j
==
0
)
output
[
compute_idx
(
0
)]
=
0
;
if
(
k
<
n
)
output
[
compute_idx
(
k
)]
=
x
;
});
});
}
else
{
gs_launch
(
stream
,
rshape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
const
auto
ridx
=
rshape
.
multi
(
i
/
block_size
);
auto
compute_idx
=
[
&
](
auto
j
)
{
auto
k
=
ridx
;
k
[
axis
]
=
j
;
return
k
;
};
block_scan
<
max_block_size
>
(
idx
,
sum
{},
0
,
n
,
[
&
](
auto
j
)
{
return
input
[
compute_idx
(
j
)];
},
[
&
](
auto
j
,
auto
x
)
{
output
[
compute_idx
(
j
)]
=
x
;
});
});
}
});
}
...
...
src/targets/gpu/device/softmax.cpp
View file @
7e297b13
...
...
@@ -20,34 +20,58 @@ void softmax(hipStream_t stream, const argument& result, const argument& arg, in
migraphx
::
shape
batch_shape
{
result
.
get_shape
().
type
(),
batch_lens
};
hip_visit_all
(
result
,
arg
,
batch_shape
)([
&
](
auto
output
,
auto
input
,
auto
batch
)
{
const
index_int
max_block_size
=
256
;
const
index_int
max_block_size
=
128
;
const
index_int
block_size
=
compute_block_size
(
batch_item_num
,
max_block_size
);
gs_launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)([
=
](
auto
i
,
auto
idx
)
__device__
{
auto
data_idx
=
batch
.
multi
(
i
/
block_size
);
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
input
)
::
value_type
>>
;
type
init
=
lowest
();
auto
batch_max
=
block_reduce
<
max_block_size
>
(
idx
,
max
{},
init
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
return
input
[
data_idx
];
});
using
type
=
device_type
<
std
::
remove_cv_t
<
typename
decltype
(
input
)
::
value_type
>>
;
type
init
=
lowest
();
if
(
axis
==
batch_lens
.
size
()
-
1
)
{
gs_launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
auto
start_loc
=
i
/
block_size
*
batch_item_num
;
auto
batch_max
=
block_reduce
<
max_block_size
>
(
idx
,
max
{},
init
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
return
input
[
start_loc
+
j
];
});
auto
batch_sum
=
block_reduce
<
max_block_size
>
(
idx
,
sum
{},
0
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
auto
val
=
input
[
start_loc
+
j
]
-
batch_max
;
return
::
exp
(
to_hip_type
(
val
));
});
auto
batch_sum
=
block_reduce
<
max_block_size
>
(
idx
,
sum
{},
0
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
return
::
exp
(
to_hip_type
(
val
));
idx
.
local_stride
(
batch_item_num
,
[
&
](
auto
j
)
__device__
{
auto
val
=
input
[
start_loc
+
j
]
-
batch_max
;
output
[
start_loc
+
j
]
=
::
exp
(
to_hip_type
(
val
))
/
batch_sum
;
});
});
}
else
{
gs_launch
(
stream
,
batch_shape
.
elements
()
*
block_size
,
block_size
)(
[
=
](
auto
i
,
auto
idx
)
__device__
{
auto
data_idx
=
batch
.
multi
(
i
/
block_size
);
auto
batch_max
=
block_reduce
<
max_block_size
>
(
idx
,
max
{},
init
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
return
input
[
data_idx
];
});
idx
.
local_stride
(
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
output
[
data_idx
]
=
::
exp
(
to_hip_type
(
val
))
/
batch_sum
;
});
});
auto
batch_sum
=
block_reduce
<
max_block_size
>
(
idx
,
sum
{},
0
,
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
return
::
exp
(
to_hip_type
(
val
));
});
idx
.
local_stride
(
batch_item_num
,
[
&
](
auto
j
)
__device__
{
data_idx
[
axis
]
=
j
;
auto
val
=
input
[
data_idx
]
-
batch_max
;
output
[
data_idx
]
=
::
exp
(
to_hip_type
(
val
))
/
batch_sum
;
});
});
}
});
}
...
...
src/targets/gpu/device/topk.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/topk.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/visit.hpp>
#include <migraphx/ranges.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Fixed-size binary heap laid over externally owned storage.
// Logical slot i of the heap lives at data[data_index(i)]. `compare(a, b)`
// returning true moves a toward the root, so slot 0 always holds the element
// that precedes all others under `compare`.
template <class T, class Index, class Compare>
struct hip_heap_vector
{
    // Adopts the first `n` logical slots of `val` and heapifies them in place.
    MIGRAPHX_DEVICE_CONSTEXPR hip_heap_vector(T* val, index_int n, Index v_idx, Compare comp)
        : data(val), size(n), data_index(v_idx), compare(comp)
    {
        make_heap(size);
    }

    // Replaces the root with `val` unless `val` precedes the current root, in
    // which case the heap is left unchanged.
    MIGRAPHX_DEVICE_CONSTEXPR void try_push(const T val)
    {
        if(compare(val, data[data_index(0)]))
            return;

        const index_int last = size - 1;
        pop_heap(last);
        data[data_index(last)] = val;
        push_heap(last);
    }

    // Turns the heap into a sequence ordered so that the root element ends up
    // in the last slot.
    MIGRAPHX_DEVICE_CONSTEXPR void sort() { sort_heap(size); }

    private:
    // Exchanges two stored values through a temporary copy.
    MIGRAPHX_DEVICE_CONSTEXPR inline static void swap(T& v1, T& v2)
    {
        T tmp = v1;
        v1    = v2;
        v2    = tmp;
    }

    // Sinks the element at `index` within the first `n` slots until neither
    // child precedes it.
    MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_down(index_int n, index_int index)
    {
        while(index < n)
        {
            const auto start      = index;
            const index_int left  = 2 * index + 1;
            const index_int right = 2 * index + 2;

            if(left < n && compare(data[data_index(left)], data[data_index(index)]))
            {
                index = left;
            }

            if(right < n && compare(data[data_index(right)], data[data_index(index)]))
            {
                // Pick whichever child precedes the other.
                index = compare(data[data_index(left)], data[data_index(right)]) ? left : right;
            }

            if(index == start)
            {
                break;
            }

            swap(data[data_index(index)], data[data_index(start)]);
        }
    }

    // Lifts the element at `index` while it precedes its parent.
    MIGRAPHX_DEVICE_CONSTEXPR inline void heapify_up(index_int index)
    {
        while(index > 0)
        {
            auto parent = (index - 1) / 2;
            if(not compare(data[data_index(index)], data[data_index(parent)]))
            {
                break;
            }

            swap(data[data_index(index)], data[data_index(parent)]);
            index = parent;
        }
    }

    // Establishes the heap property over the first `n` slots, starting from
    // the last slot that has a child.
    MIGRAPHX_DEVICE_CONSTEXPR inline void make_heap(index_int n)
    {
        int j = n / 2 - 1;
        while(j >= 0)
        {
            heapify_down(n, j);
            --j;
        }
    }

    // Restores the heap after a new value has been written to slot `loc`.
    MIGRAPHX_DEVICE_CONSTEXPR inline void push_heap(index_int loc) { heapify_up(loc); }

    // Moves the root to slot `loc` and re-heapifies the first `loc` slots.
    MIGRAPHX_DEVICE_CONSTEXPR inline void pop_heap(index_int loc)
    {
        swap(data[data_index(0)], data[data_index(loc)]);
        heapify_down(loc, 0);
    }

    // Repeatedly moves the root behind the shrinking heap prefix.
    MIGRAPHX_DEVICE_CONSTEXPR inline void sort_heap(index_int n)
    {
        int j = n - 1;
        while(j > 0)
        {
            swap(data[data_index(0)], data[data_index(j)]);
            heapify_down(j, 0);
            --j;
        }
    }

    T* data = nullptr;
    index_int size;
    Index data_index;
    Compare compare;
};
// Factory that deduces the template arguments of hip_heap_vector from its
// constructor arguments.
template <class T, class Index, class Compare>
__device__ hip_heap_vector<T, Index, Compare>
make_heap(T* data, index_int n, Index idx, Compare compare)
{
    using heap_type = hip_heap_vector<T, Index, Compare>;
    return heap_type(data, n, idx, compare);
}
// Selects k entries along `axis` of `arg` for every position of the remaining
// dimensions, one work item per position. The chosen positions along the axis
// are written to ind_res (accessed as int64_t) and the matching values to
// val_res; both arguments are returned. `compare` decides the ordering: the
// heap keeps the k entries that do NOT precede the others under `compare`.
template <class Compare>
std::vector<argument> topk(hipStream_t stream,
                           const argument& val_res,
                           const argument& ind_res,
                           const argument& arg,
                           int64_t k,
                           int64_t axis,
                           Compare compare)
{
    auto in_s     = arg.get_shape();
    auto out_s    = val_res.get_shape();
    auto axis_dim = in_s.lens()[axis];

    // Shape of the iteration space: the input shape with `axis` collapsed to 1.
    auto comp_lens  = in_s.lens();
    comp_lens[axis] = 1;
    shape comp_s{in_s.type(), comp_lens};
    std::size_t elem_num = comp_s.elements();

    hip_visit_all(val_res, arg, out_s, in_s, comp_s)(
        [&](auto out_val, auto input, auto oss, auto iss, auto css) {
            auto* data      = device_cast(input.data());
            auto* out       = device_cast(out_val.data());
            auto* const ind = ind_res.cast<int64_t>();
            gs_launch(stream, elem_num)([=](auto i) __device__ {
                auto idx = css.multi(i);

                // Linear offset into the input for position `ii` along the axis.
                auto in_idx = [&](int ii) {
                    auto pos  = idx;
                    pos[axis] = ii;
                    return iss.index(pos);
                };

                // Linear offset into the outputs for position `ii` along the axis.
                auto out_idx = [&](int ii) {
                    auto pos  = idx;
                    pos[axis] = ii;
                    return oss.index(pos);
                };

                // The heap stores axis positions; order them by the values they
                // refer to.
                auto data_compare = [=](auto ii, auto jj) {
                    return compare(data[in_idx(ii)], data[in_idx(jj)]);
                };

                // Seed the heap with the first k positions.
                for(int j = 0; j < k; ++j)
                {
                    ind[out_idx(j)] = j;
                }
                auto hp = make_heap(ind, k, out_idx, data_compare);

                // Offer every remaining position along the axis.
                for(int j = k; j < axis_dim; ++j)
                {
                    hp.try_push(j);
                }
                hp.sort();

                // Gather the values that belong to the selected positions.
                for(int j = 0; j < k; ++j)
                {
                    out[out_idx(j)] = data[in_idx(ind[out_idx(j)])];
                }
            });
        });

    return {val_res, ind_res};
}
// Runs topk with std::less<> as the ordering and wraps the returned pair of
// arguments (values, indices) in a single argument.
argument topk_largest(hipStream_t stream,
                      const argument& val_res,
                      const argument& ind_res,
                      const argument& arg,
                      int64_t k,
                      int64_t axis)
{
    const std::less<> ordering{};
    return {topk(stream, val_res, ind_res, arg, k, axis, ordering)};
}
// Runs topk with std::greater<> as the ordering and wraps the returned pair of
// arguments (values, indices) in a single argument.
argument topk_smallest(hipStream_t stream,
                       const argument& val_res,
                       const argument& ind_res,
                       const argument& arg,
                       int64_t k,
                       int64_t axis)
{
    const std::greater<> ordering{};
    return {topk(stream, val_res, ind_res, arg, k, axis, ordering)};
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/device/where.cpp
0 → 100644
View file @
7e297b13
#include <migraphx/gpu/device/where.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/launch.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
// Returns a value-initialized object of the type that Shape::hip_index::size()
// yields; the shape argument itself is only used for type deduction.
template <class Shape>
constexpr auto get_rank(const Shape&)
{
    using index_type = typename Shape::hip_index;
    using rank_type  = decltype(index_type{}.size());
    return rank_type{};
}
// Element-wise select: result[i] = arg0[i] ? arg1[i] : arg2[i].
// The kernel is only instantiated and launched when the condition view and the
// output view have the same rank; otherwise nothing is launched.
void where(hipStream_t stream,
           const argument& result,
           const argument& arg0,
           const argument& arg1,
           const argument& arg2)
{
    hip_visit_all(result, arg1, arg2)([&](auto output, auto x, auto y) {
        hip_visit_all(arg0)([&](auto cond) {
            if constexpr(get_rank(cond.get_shape()) == get_rank(output.get_shape()))
            {
                // One work item per element of arg1.
                const auto total = arg1.get_shape().elements();
                gs_launch(stream, total)([=](auto idx) __device__ {
                    auto i = output.get_shape().multi(idx);
                    if(cond[i])
                        output[i] = x[i];
                    else
                        output[i] = y[i];
                });
            }
        });
    });
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/driver/CMakeLists.txt
View file @
7e297b13
file
(
GLOB GPU_DRIVER_SRCS
${
CONFIGURE_DEPENDS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/*.cpp
)
add_executable
(
gpu-driver
action.cpp
compile_pointwise.cpp
main.cpp
parser.cpp
perf.cpp
run_op.cpp
${
GPU_DRIVER_SRCS
}
)
target_include_directories
(
gpu-driver PRIVATE include
)
target_link_libraries
(
gpu-driver PRIVATE migraphx_gpu
)
src/targets/gpu/driver/compile_p
ointwise
.cpp
→
src/targets/gpu/driver/compile_
o
p.cpp
100755 → 100644
View file @
7e297b13
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile
_pointwise
.hpp>
#include <migraphx/gpu/compile
r
.hpp>
#include <migraphx/gpu/context.hpp>
namespace
migraphx
{
...
...
@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
namespace
driver
{
struct
compile_p
ointwise
:
action
<
compile_p
ointwise
>
struct
compile_
o
p
:
action
<
compile_
o
p
>
{
static
void
apply
(
const
parser
&
p
,
const
value
&
v
)
{
context
ctx
;
auto
inputs
=
p
.
parse_shapes
(
v
.
at
(
"inputs"
));
auto
op
=
gpu
::
compile_
pointwise
(
ctx
,
inputs
,
v
.
at
(
"
l
am
bda
"
).
to
<
std
::
string
>
());
auto
op
=
gpu
::
compile_
op
(
v
.
at
(
"
n
am
e
"
).
to
<
std
::
string
>
()
,
ctx
,
inputs
,
v
);
double
t
=
time_op
(
ctx
,
op
,
inputs
,
p
.
get
(
v
,
"iterations"
,
100
));
std
::
cout
<<
op
<<
": "
<<
t
<<
"ms"
<<
std
::
endl
;
}
...
...
src/targets/gpu/driver/main.cpp
View file @
7e297b13
...
...
@@ -2,6 +2,7 @@
#include <migraphx/json.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/file_buffer.hpp>
#include <iostream>
using
namespace
migraphx
;
// NOLINT
using
namespace
migraphx
::
gpu
;
// NOLINT
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
39
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment