Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
2f268bc2
Commit
2f268bc2
authored
Jun 12, 2022
by
Paul
Browse files
Merge branch 'develop' into mlir-c
parents
f75c5a38
aa7ff911
Changes
205
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1043 additions
and
116 deletions
+1043
-116
src/targets/gpu/jit/pointwise.cpp
src/targets/gpu/jit/pointwise.cpp
+42
-25
src/targets/gpu/jit/reduce.cpp
src/targets/gpu/jit/reduce.cpp
+179
-0
src/targets/gpu/jit/roialign.cpp
src/targets/gpu/jit/roialign.cpp
+0
-1
src/targets/gpu/jit/scatternd.cpp
src/targets/gpu/jit/scatternd.cpp
+1
-3
src/targets/gpu/kernel.cpp
src/targets/gpu/kernel.cpp
+2
-0
src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
...argets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
+39
-0
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+3
-2
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
+78
-5
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
+55
-0
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
...rgets/gpu/kernels/include/migraphx/kernels/functional.hpp
+46
-20
src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
...targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
+81
-0
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+10
-9
src/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp
...ts/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp
+1
-1
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+74
-12
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+83
-0
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
...argets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
+8
-11
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
+45
-1
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
+4
-0
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+266
-0
src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
...targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+26
-26
No files found.
src/targets/gpu/jit/pointwise.cpp
View file @
2f268bc2
...
...
@@ -2,10 +2,12 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
...
...
@@ -16,6 +18,8 @@ namespace migraphx {
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
using
namespace
migraphx
::
gpu
::
gen
;
// NOLINT
static
const
char
*
const
pointwise_kernel
=
R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
...
...
@@ -26,9 +30,10 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
__global__ void
${
kernel
}
(${params})
{
pointwise(${lambda}, ${args});
auto idx = make_index();
pointwise(idx, ${transformers})(${lambda}, ${args});
}
}
...
...
@@ -37,44 +42,51 @@ __global__ void kernel(${params})
)__migraphx__"
;
static
std
::
vector
<
std
::
string
>
get_op_names
(
const
module
&
m
)
{
std
::
vector
<
std
::
string
>
result
;
for
(
auto
&
ins
:
m
)
{
if
(
starts_with
(
ins
.
name
(),
"@"
))
continue
;
result
.
push_back
(
ins
.
name
());
}
return
result
;
}
struct
pointwise_compiler
:
compiler
<
pointwise_compiler
>
{
std
::
vector
<
std
::
string
>
names
()
const
{
return
{
"pointwise"
};
}
static
std
::
size_t
oversubscribe
(
const
std
::
vector
<
shape
>&
inputs
)
static
std
::
size_t
oversubscribe
_if
(
bool
b
)
{
if
(
std
::
any_of
(
inputs
.
begin
(),
inputs
.
end
(),
[](
const
auto
&
s
)
{
return
s
.
broadcasted
();
})
)
return
1
;
if
(
b
)
return
256
;
else
return
4
;
}
static
std
::
size_t
vectorize_elements
(
const
std
::
vector
<
shape
>&
inputs
)
{
std
::
size_t
n
=
inputs
.
front
().
elements
();
if
(
std
::
all_of
(
inputs
.
begin
(),
inputs
.
end
(),
[](
const
auto
&
s
)
{
return
s
.
packed
()
or
s
.
broadcasted
();
}))
{
if
((
n
%
4
)
==
0
)
return
n
/
4
;
else
if
((
n
%
2
)
==
0
)
return
n
/
2
;
}
return
n
;
return
1
;
}
operation
compile_op
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
const
value
&
v
)
const
{
hip_compile_options
options
;
options
.
set_launch_params
(
v
,
compute_global_for
(
ctx
,
vectorize_elements
(
inputs
),
oversubscribe
(
inputs
)));
options
.
inputs
=
inputs
;
options
.
output
=
inputs
.
back
();
options
.
virtual_inputs
=
reduce_dims
(
inputs
);
options
.
params
=
"-Wno-float-equal"
;
auto
src
=
interpolate_string
(
pointwise_kernel
,
{{
"params"
,
enum_params
(
inputs
.
size
(),
"void * private_p"
)},
auto
axis
=
find_fast_axis
(
options
.
virtual_inputs
);
auto
vec
=
vectorize
::
elements
(
axis
,
options
.
virtual_inputs
);
auto
preloads
=
preload
::
broadcasts
(
axis
,
options
.
virtual_inputs
);
options
.
kernel_name
=
v
.
get
(
"kernel"
,
"kernel"
);
options
.
set_launch_params
(
v
,
compute_global_for
(
ctx
,
options
.
output
.
elements
()
/
vec
.
size
,
oversubscribe_if
(
not
preloads
.
is_preloading
())));
auto
src
=
interpolate_string
(
pointwise_kernel
,
{{
"kernel"
,
options
.
kernel_name
},
{
"params"
,
enum_params
(
inputs
.
size
(),
"void * private_p"
)},
{
"args"
,
enum_params
(
inputs
.
size
(),
"private_p"
)},
{
"lambda"
,
v
.
at
(
"lambda"
).
to
<
std
::
string
>
()},
{
"transformers"
,
make_transformer_args
(
preloads
,
vec
)},
{
"preamble"
,
v
.
get
(
"preamble"
,
std
::
string
{})}});
return
compile_hip_code_object
(
src
,
options
);
}
...
...
@@ -100,8 +112,13 @@ struct pointwise_compiler : compiler<pointwise_compiler>
auto
name
=
g
.
create_function
(
g
.
generate_module
(
*
pm
).
set_attributes
({
"__device__"
}).
set_generic_types
(
*
pm
));
std
::
string
lambda
=
"MIGRAPHX_LIFT("
+
name
+
")"
;
auto
op_names
=
get_op_names
(
*
pm
);
op_names
.
push_back
(
"kernel"
);
auto
op_name_string
=
join_strings
(
op_names
,
"_"
);
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
{{
"lambda"
,
lambda
},
{
"preamble"
,
g
.
str
()}}));
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
{{
"lambda"
,
lambda
},
{
"preamble"
,
g
.
str
()},
{
"kernel"
,
op_name_string
}}));
}
};
}
// namespace gpu
...
...
src/targets/gpu/jit/reduce.cpp
0 → 100644
View file @
2f268bc2
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
using
namespace
migraphx
::
gpu
::
gen
;
// NOLINT
static
const
char
*
const
simple_reduce_kernel
=
R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/reduce.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void reduce_kernel(void* input_p, void* output_p)
{
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
simple_reduce<reduce::${algo}>(${reduction}, ${init}, input, output, ${read}, ${write});
});
}
}
} // namespace migraphx
)__migraphx__"
;
static
std
::
size_t
get_reduce_elements
(
const
std
::
vector
<
shape
>&
inputs
)
{
return
inputs
.
front
().
elements
()
/
inputs
.
back
().
elements
();
}
static
std
::
size_t
get_reduce_elements
(
const
std
::
vector
<
instruction_ref
>&
inputs
)
{
return
get_reduce_elements
(
to_shapes
(
inputs
));
}
static
std
::
vector
<
std
::
size_t
>
get_reduce_lens
(
const
std
::
vector
<
std
::
size_t
>&
input_lens
,
const
std
::
vector
<
std
::
size_t
>&
output_lens
)
{
std
::
vector
<
std
::
size_t
>
reduce_lens
;
std
::
transform
(
output_lens
.
begin
(),
output_lens
.
end
(),
input_lens
.
begin
(),
std
::
back_inserter
(
reduce_lens
),
[](
auto
x
,
auto
y
)
->
std
::
size_t
{
if
(
x
==
y
)
return
1
;
else
return
y
;
});
return
reduce_lens
;
}
static
std
::
string
get_reduce_algo
(
const
std
::
vector
<
shape
>&
inputs
)
{
auto
rlens
=
get_reduce_lens
(
inputs
.
front
().
lens
(),
inputs
.
back
().
lens
());
const
auto
init
=
std
::
numeric_limits
<
std
::
size_t
>::
max
();
// The minimum stride
auto
min_stride
=
std
::
inner_product
(
rlens
.
begin
(),
rlens
.
end
(),
inputs
.
front
().
strides
().
begin
(),
init
,
[](
auto
x
,
auto
y
)
{
return
std
::
min
(
x
,
y
);
},
[](
auto
len
,
auto
stride
)
{
return
len
==
1
?
init
:
stride
;
});
if
(
min_stride
>
2
)
return
"lane"
;
return
"block"
;
}
struct
reduce_compiler
:
compiler
<
reduce_compiler
>
{
std
::
vector
<
std
::
string
>
names
()
const
{
return
{
"reduce"
,
"reduce_sum"
,
"reduce_mean"
,
"reduce_max"
,
"reduce_min"
,
"reduce_prod"
};
}
operation
compile_op
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
inputs
,
const
value
&
v
)
const
{
hip_compile_options
options
;
options
.
inputs
=
inputs
;
options
.
output
=
inputs
.
back
();
options
.
virtual_inputs
=
reduce_dims
(
inputs
);
auto
faxis
=
find_fast_axis
({
options
.
virtual_inputs
.
front
()});
vectorize
vec
{};
// Vectorize if the axis is a reduction axis
if
(
options
.
virtual_inputs
.
back
().
lens
()[
faxis
]
==
1
)
{
vec
=
vectorize
::
elements
(
faxis
,
options
.
virtual_inputs
);
}
auto
relements
=
get_reduce_elements
(
options
.
virtual_inputs
)
/
vec
.
size
;
auto
nelements
=
options
.
virtual_inputs
.
back
().
elements
();
auto
algo
=
v
.
get
(
"algo"
,
get_reduce_algo
(
options
.
virtual_inputs
));
if
(
algo
==
"block"
)
{
auto
block_size
=
compute_block_size
(
relements
,
256
);
options
.
set_launch_params
(
v
,
compute_global_for
(
ctx
,
nelements
*
block_size
,
256
),
block_size
);
}
else
if
(
algo
==
"lane"
)
{
options
.
set_launch_params
(
v
,
compute_global_for
(
ctx
,
nelements
,
256
));
}
else
{
MIGRAPHX_THROW
(
"Unknown reduce algo: "
+
algo
);
}
options
.
kernel_name
=
"reduce_kernel"
;
std
::
string
identity
=
"[](auto x) { return x; }"
;
auto
src
=
interpolate_string
(
simple_reduce_kernel
,
{{
"reduction"
,
v
.
at
(
"reduction"
).
to
<
std
::
string
>
()},
{
"init"
,
v
.
get
(
"init"
,
std
::
string
{
"0"
})},
{
"read"
,
v
.
get
(
"read"
,
identity
)},
{
"write"
,
v
.
get
(
"write"
,
identity
)},
{
"algo"
,
algo
},
{
"transformers"
,
make_transformer_args
(
vec
)},
{
"preamble"
,
v
.
get
(
"preamble"
,
std
::
string
{})}});
options
.
params
+=
"-Wno-float-equal"
;
return
compile_hip_code_object
(
src
,
options
);
}
compiler_replace
compile
(
context
&
ctx
,
instruction_ref
ins
,
const
operation
&
op
)
const
{
value
v
=
value
::
object
{};
auto
reduce_elements
=
get_reduce_elements
(
ins
->
inputs
());
if
(
op
.
name
()
==
"reduce_sum"
)
{
v
[
"reduction"
]
=
"op::sum{}"
;
}
else
if
(
op
.
name
()
==
"reduce_mean"
)
{
v
[
"reduction"
]
=
"op::sum{}"
;
v
[
"write"
]
=
"op::mean{"
+
std
::
to_string
(
reduce_elements
)
+
"}"
;
}
else
if
(
op
.
name
()
==
"reduce_max"
)
{
v
[
"reduction"
]
=
"op::max{}"
;
v
[
"init"
]
=
"lowest{}"
;
}
else
if
(
op
.
name
()
==
"reduce_min"
)
{
v
[
"reduction"
]
=
"op::min{}"
;
v
[
"init"
]
=
"highest{}"
;
}
else
if
(
op
.
name
()
==
"reduce_prod"
)
{
v
[
"reduction"
]
=
"op::product{}"
;
v
[
"init"
]
=
"1"
;
}
else
{
MIGRAPHX_THROW
(
"Unsupported reduce"
);
}
return
replace
(
compile_op
(
ctx
,
to_shapes
(
ins
->
inputs
()),
v
));
}
};
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/jit/roialign.cpp
View file @
2f268bc2
...
...
@@ -19,7 +19,6 @@ namespace gpu {
// NOLINTNEXTLINE
static
const
char
*
const
roialign_kernel
=
R"__migraphx__(
#include <migraphx/kernels/roialign.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>
...
...
src/targets/gpu/jit/scatternd.cpp
View file @
2f268bc2
...
...
@@ -19,7 +19,6 @@ namespace gpu {
// NOLINTNEXTLINE
static
const
char
*
const
scatternd_kernel
=
R"__migraphx__(
#include <migraphx/kernels/scatternd.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>
...
...
@@ -52,9 +51,8 @@ struct scatternd_compiler : compiler<scatternd_compiler>
{
hip_compile_options
options
;
options
.
set_launch_params
(
v
,
compute_global_for
(
ctx
,
inputs
.
at
(
1
).
elements
()));
auto
out_s
=
inputs
.
back
();
options
.
inputs
=
inputs
;
options
.
output
=
out_s
;
options
.
output
=
inputs
.
back
()
;
options
.
kernel_name
=
"scatternd_kernel"
;
options
.
virtual_inputs
=
inputs
;
auto
reduction
=
"assign_"
+
v
.
get
(
"reduction"
,
std
::
string
{
"none"
});
...
...
src/targets/gpu/kernel.cpp
View file @
2f268bc2
...
...
@@ -59,6 +59,8 @@ void launch_kernel(hipFunction_t fun,
void
*
kernargs
,
std
::
size_t
size
)
{
assert
(
global
>
0
);
assert
(
local
>
0
);
void
*
config
[]
=
{
// HIP_LAUNCH_PARAM_* are macros that do horrible things
#ifdef MIGRAPHX_USE_CLANG_TIDY
...
...
src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
View file @
2f268bc2
...
...
@@ -21,6 +21,16 @@ struct greater
}
};
template
<
class
InputIt
,
class
T
,
class
BinaryOperation
>
constexpr
T
accumulate
(
InputIt
first
,
InputIt
last
,
T
init
,
BinaryOperation
op
)
{
for
(;
first
!=
last
;
++
first
)
{
init
=
op
(
std
::
move
(
init
),
*
first
);
}
return
init
;
}
template
<
class
InputIt
,
class
OutputIt
>
constexpr
OutputIt
copy
(
InputIt
first
,
InputIt
last
,
OutputIt
d_first
)
{
...
...
@@ -106,6 +116,35 @@ constexpr Iterator1 search(Iterator1 first, Iterator1 last, Iterator2 s_first, I
}
}
template
<
class
InputIt1
,
class
InputIt2
,
class
T
,
class
BinaryOperation1
,
class
BinaryOperation2
>
constexpr
T
inner_product
(
InputIt1
first1
,
InputIt1
last1
,
InputIt2
first2
,
T
init
,
BinaryOperation1
op1
,
BinaryOperation2
op2
)
{
while
(
first1
!=
last1
)
{
init
=
op1
(
init
,
op2
(
*
first1
,
*
first2
));
++
first1
;
++
first2
;
}
return
init
;
}
template
<
class
InputIt1
,
class
InputIt2
,
class
T
>
constexpr
T
inner_product
(
InputIt1
first1
,
InputIt1
last1
,
InputIt2
first2
,
T
init
)
{
return
inner_product
(
first1
,
last1
,
first2
,
init
,
[](
auto
x
,
auto
y
)
{
return
x
+
y
;
},
[](
auto
x
,
auto
y
)
{
return
x
*
y
;
});
}
}
// namespace migraphx
#endif
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
View file @
2f268bc2
...
...
@@ -74,6 +74,7 @@ struct array
constexpr
const
T
*
data
()
const
{
return
d
;
}
constexpr
index_constant
<
N
>
size
()
const
{
return
{};
}
constexpr
auto
empty
()
const
{
return
size
()
==
_c
<
0
>
;
}
constexpr
T
*
begin
()
{
return
d
;
}
constexpr
const
T
*
begin
()
const
{
return
d
;
}
...
...
@@ -145,8 +146,8 @@ struct array
constexpr
array
carry
(
array
result
)
const
{
u
in
t32_
t
overflow
=
0
;
for
(
std
::
ptr
diff_t
i
=
result
.
size
()
-
1
;
i
>
0
;
i
--
)
in
dex_in
t
overflow
=
0
;
for
(
diff_
in
t
i
=
result
.
size
()
-
1
;
i
>
0
;
i
--
)
{
auto
z
=
result
[
i
]
+
overflow
;
// Reset overflow
...
...
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
View file @
2f268bc2
...
...
@@ -42,6 +42,32 @@ struct print_buffer
pos
++
;
}
}
template
<
class
T
,
class
=
decltype
(
T
{}
%
10
,
-
T
{}
)>
constexpr
void
append
(
T
i
)
{
if
(
i
<
0
)
{
append
(
'-'
);
i
=
-
i
;
}
char
c
=
(
i
%
10
)
+
'0'
;
if
(
i
>
9
)
append
(
i
/
10
);
append
(
c
);
}
constexpr
void
append
(
const
char
*
str
)
{
if
(
str
==
nullptr
)
return
;
int
i
=
512
;
while
(
*
str
!=
0
and
i
>
0
)
{
append
(
*
str
);
str
++
;
i
--
;
}
}
template
<
size_t
M
>
constexpr
void
append
(
const
char
(
&
array
)[
M
])
...
...
@@ -54,14 +80,36 @@ struct print_buffer
template
<
class
...
Ts
>
__host__
__device__
void
print
(
const
Ts
&
...
xs
)
{
const
auto
size
=
(
sizeof
(
xs
)
+
...);
print_buffer
<
size
>
buffer
;
print_buffer
<
1024
>
buffer
;
swallow
{(
buffer
.
append
(
xs
),
0
)...};
printf
(
"%s"
,
buffer
.
buffer
);
}
}
// namespace debug
struct
source_location
{
int
line
=
__builtin_LINE
();
const
char
*
file
=
__builtin_FILE
();
const
char
*
function
=
__builtin_FUNCTION
();
};
template
<
class
T
>
struct
source_location_capture
{
T
x
;
source_location
loc
;
template
<
class
U
,
class
=
decltype
(
T
(
U
{}
))>
constexpr
source_location_capture
(
U
px
,
source_location
ploc
=
source_location
{})
:
x
(
px
),
loc
(
ploc
)
{
}
constexpr
operator
source_location
()
const
{
return
loc
;
}
constexpr
operator
T
()
const
{
return
x
;
}
};
// noreturn cannot be used on this function because abort in hip is broken
template
<
class
T1
,
class
T2
,
class
T3
,
class
T4
>
MIGRAPHX_HIP_NORETURN
inline
__host__
__device__
void
...
...
@@ -73,13 +121,38 @@ assert_fail(const T1& assertion, const T2& file, const T3& line, const T4& funct
abort
();
}
#ifdef MIGRAPHX_DEBUG
#define MIGRAPHX_ASSERT(cond) \
template
<
class
...
Ts
>
MIGRAPHX_HIP_NORETURN
inline
__host__
__device__
void
assert_fail
(
const
source_location
&
loc
,
Ts
...
xs
)
{
debug
::
print
(
loc
.
file
,
":"
,
loc
.
line
,
": "
,
loc
.
function
,
": error: "
,
xs
...,
"
\n
"
);
abort
();
}
// NOLINTNEXTLINE
#define MIGRAPHX_ASSERT_FAIL(cond, ...) \
((cond) ? void(0) : [](auto&&... private_migraphx_xs) { \
assert_fail(private_migraphx_xs...); \
}(#cond, __FILE__, MIGRAPHX_STRINGIZE(__LINE__), __PRETTY_FUNCTION__))
}(__VA_ARGS__))
// NOLINTNEXTLINE
#define MIGRAPHX_CHECK(cond) \
MIGRAPHX_ASSERT_FAIL(cond, #cond, __FILE__, __LINE__, __PRETTY_FUNCTION__)
#ifdef MIGRAPHX_DEBUG
// NOLINTNEXTLINE
#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) source_location_capture<T>
#define MIGRAPHX_WARN(cond, loc, ...) MIGRAPHX_ASSERT_FAIL(cond, loc, __VA_ARGS__)
#define MIGRAPHX_ASSERT MIGRAPHX_CHECK
#define MIGRAPHX_ASSUME MIGRAPHX_CHECK
#define MIGRAPHX_UNREACHABLE() MIGRAPHX_ASSERT(false)
#else
// NOLINTNEXTLINE
#define MIGRAPHX_CAPTURE_SOURCE_LOCATION(T) T
#define MIGRAPHX_ASSUME __builtin_assume
#define MIGRAPHX_UNREACHABLE __builtin_unreachable
#define MIGRAPHX_ASSERT(cond)
#define MIGRAPHX_WARN(...)
#endif
}
// namespace migraphx
...
...
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
0 → 100644
View file @
2f268bc2
#ifndef MIGRAPHX_GUARD_KERNELS_DPP_HPP
#define MIGRAPHX_GUARD_KERNELS_DPP_HPP
#include <migraphx/kernels/hip.hpp>
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/debug.hpp>
namespace
migraphx
{
#ifndef MIGRAPHX_HAS_DPP
#define MIGRAPHX_HAS_DPP 1
#endif
#if MIGRAPHX_HAS_DPP
constexpr
unsigned
int
dpp_row_shr
(
unsigned
int
x
)
{
return
0x110u
|
x
;
}
constexpr
unsigned
int
dpp_row_bcast
(
unsigned
int
x
)
{
unsigned
int
y
=
0
;
switch
(
x
)
{
case
15
:
y
=
0x142
;
break
;
case
31
:
y
=
0x143
;
break
;
default:
MIGRAPHX_UNREACHABLE
();
}
return
y
;
}
template
<
unsigned
int
DppCtrl
,
unsigned
int
RowMask
=
0xf
,
unsigned
int
BankMask
=
0xf
,
bool
BoundCtrl
=
false
,
class
T
>
__device__
T
dpp_mov
(
T
&
x
)
{
static
const
index_int
n
=
sizeof
(
T
)
<
4
?
1
:
sizeof
(
T
)
/
4
;
union
type
{
uint32_t
reg
[
n
];
T
data
;
};
type
output
{};
type
input
{};
// cppcheck-suppress unreadVariable
input
.
data
=
x
;
for
(
index_int
i
=
0
;
i
<
n
;
i
++
)
{
output
.
reg
[
i
]
=
__hip_move_dpp
(
input
.
reg
[
i
],
DppCtrl
,
RowMask
,
BankMask
,
BoundCtrl
);
}
return
output
.
data
;
}
#endif
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_DPP_HPP
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
View file @
2f268bc2
...
...
@@ -3,6 +3,14 @@
#include <migraphx/kernels/array.hpp>
// NOLINTNEXTLINE
#define MIGRAPHX_RETURNS(...) \
->decltype(__VA_ARGS__) { return __VA_ARGS__; }
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
namespace
migraphx
{
struct
swallow
...
...
@@ -129,7 +137,7 @@ constexpr auto by(F f)
template
<
class
F
,
class
...
Ts
>
constexpr
void
each_args
(
F
f
,
Ts
&&
...
xs
)
{
swallow
{(
f
(
st
d
::
forward
<
Ts
>
(
xs
)),
0
)...};
swallow
{(
f
(
st
atic_cast
<
Ts
&&
>
(
xs
)),
0
)...};
}
template
<
class
F
>
...
...
@@ -161,6 +169,18 @@ constexpr auto pack(Ts... xs)
return
[
=
](
auto
f
)
{
return
f
(
xs
...);
};
}
template
<
class
G
,
class
F
>
constexpr
auto
join
(
G
g
,
F
f
)
{
return
f
([
=
](
auto
...
xs
)
{
return
g
(
xs
...);
});
}
template
<
class
G
,
class
F
,
class
...
Fs
>
constexpr
auto
join
(
G
g
,
F
f
,
Fs
...
fs
)
{
return
f
([
=
](
auto
...
xs
)
{
return
join
([
=
](
auto
...
ys
)
{
return
g
(
xs
...,
ys
...);
},
fs
...);
});
}
template
<
class
Compare
,
class
P1
,
class
P2
>
constexpr
auto
pack_compare
(
Compare
compare
,
P1
p1
,
P2
p2
)
{
...
...
@@ -191,39 +211,45 @@ constexpr auto arg(IntegralConstant ic)
return
arg_c
<
ic
>
();
}
inline
constexpr
auto
rotate_last
()
template
<
class
F
>
constexpr
auto
make_transform
(
F
f
)
{
return
[](
auto
...
xs
)
{
return
[
=
](
auto
&&
f
)
{
return
sequence_c
<
sizeof
...(
xs
)
>
([
&
](
auto
...
is
)
{
constexpr
auto
size
=
sizeof
...(
is
);
return
f
(
arg_c
<
(
is
+
size
-
1
)
%
size
>
()(
xs
...)...);
});
};
};
return
[
=
](
auto
...
xs
)
{
return
[
=
](
auto
g
)
{
return
f
(
g
,
xs
...);
};
};
}
// An arg transformation takes the arguments and then a function to take the new arguments:
// transform(xs...)([](auto... ys) { ... })
// The transform_args function takes a list of transformations and continually applies them
template
<
class
F
>
constexpr
auto
transform_args
(
F
f
)
{
return
[
=
](
auto
...
xs
)
{
return
[
=
](
auto
g
)
{
return
f
(
xs
...)([
&
](
auto
...
ys
)
{
return
g
(
ys
...);
});
};
};
return
f
;
}
template
<
class
F
,
class
...
Fs
>
constexpr
auto
transform_args
(
F
f
,
Fs
...
fs
)
{
return
[
=
](
auto
...
xs
)
{
return
transform_args
(
f
)(
xs
...)(
transform_args
(
fs
...));
};
return
make_transform
([
=
](
auto
g
,
auto
...
xs
)
{
return
f
(
xs
...)([
=
](
auto
...
ys
)
{
return
transform_args
(
fs
...)(
ys
...)(
g
);
});
});
}
// NOLINTNEXTLINE
#define MIGRAPHX_RETURNS(...) \
->decltype(__VA_ARGS__) { return __VA_ARGS__; }
// identity transform
inline
constexpr
auto
transform_args
()
{
return
make_transform
([](
auto
f
,
auto
...
xs
)
{
return
f
(
xs
...);
});
}
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))
// Rotate the first argument to the last argument
inline
constexpr
auto
rotate_last
()
{
return
make_transform
([](
auto
f
,
auto
...
xs
)
{
return
sequence_c
<
sizeof
...(
xs
)
>
([
&
](
auto
...
is
)
{
constexpr
auto
size
=
sizeof
...(
is
);
return
f
(
arg_c
<
(
is
+
size
-
1
)
%
size
>
()(
xs
...)...);
});
});
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
0 → 100644
View file @
2f268bc2
#ifndef MIGRAPHX_GUARD_KERNELS_GATHERND_HPP
#define MIGRAPHX_GUARD_KERNELS_GATHERND_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace
migraphx
{
template
<
class
T
>
struct
gathernd_settings
{
T
batch_dims
{};
};
template
<
class
...
Ts
>
constexpr
gathernd_settings
<
Ts
...
>
make_gathernd_settings
(
Ts
...
xs
)
{
return
{
xs
...};
}
template
<
class
T
,
class
U
,
class
V
,
class
Settings
>
__device__
void
gathernd
(
const
T
&
data_t
,
const
U
&
indices_t
,
const
V
&
output_t
,
Settings
s
)
{
auto
ind
=
make_index
();
auto
batch_dims
=
s
.
batch_dims
;
auto
output_shape
=
output_t
.
get_shape
();
auto
indices_shape
=
indices_t
.
get_shape
();
auto
data_shape
=
data_t
.
get_shape
();
auto
indices_shape_lens
=
indices_shape
.
lens
;
auto
data_shape_lens
=
data_shape
.
lens
;
auto
num_slice_dims
=
indices_shape_lens
.
back
();
std
::
size_t
num_slices
=
accumulate
(
indices_shape_lens
.
begin
(),
indices_shape_lens
.
end
()
-
1
,
1
,
std
::
multiplies
<
std
::
size_t
>
());
std
::
size_t
slice_size
=
accumulate
(
data_shape_lens
.
begin
()
+
num_slice_dims
+
batch_dims
,
data_shape_lens
.
end
(),
1
,
std
::
multiplies
<
std
::
size_t
>
());
const
std
::
size_t
num_batches
=
accumulate
(
data_shape_lens
.
begin
(),
data_shape_lens
.
begin
()
+
batch_dims
,
1
,
std
::
multiplies
<
std
::
size_t
>
());
const
std
::
size_t
data_batch_stride
=
accumulate
(
data_shape_lens
.
begin
()
+
batch_dims
,
data_shape_lens
.
end
(),
1
,
std
::
multiplies
<
std
::
size_t
>
());
const
auto
num_slices_per_batch
=
num_slices
/
num_batches
;
ind
.
global_stride
(
output_shape
.
elements
(),
[
&
](
auto
i
)
{
const
auto
*
indices_ptr
=
indices_t
.
data
();
const
std
::
size_t
j
=
i
/
slice_size
;
const
std
::
size_t
batch_idx
=
j
/
num_slices_per_batch
;
auto
*
slice_indices
=
indices_ptr
+
(
j
*
num_slice_dims
);
std
::
size_t
relative_slice_offset
=
0
;
for
(
std
::
size_t
idx
=
0
;
idx
<
num_slice_dims
;
++
idx
)
{
int64_t
index
=
slice_indices
[
idx
];
const
std
::
size_t
input_dim_idx
=
batch_dims
+
idx
;
const
auto
input_dim
=
data_shape_lens
[
input_dim_idx
];
assert
(
index
>=
-
static_cast
<
int64_t
>
(
input_dim
)
and
index
<
static_cast
<
int64_t
>
(
input_dim
));
if
(
index
<
0
)
index
+=
input_dim
;
std
::
size_t
size_from_slice_dims
=
accumulate
(
data_shape_lens
.
begin
()
+
batch_dims
+
idx
+
1
,
data_shape_lens
.
begin
()
+
batch_dims
+
num_slice_dims
,
slice_size
,
std
::
multiplies
<
std
::
size_t
>
());
relative_slice_offset
+=
index
*
size_from_slice_dims
;
}
auto
slice_offset
=
(
batch_idx
*
data_batch_stride
)
+
relative_slice_offset
;
output_t
[
i
]
=
data_t
[
slice_offset
+
i
%
slice_size
];
});
}
}
// namespace migraphx
#endif
src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
View file @
2f268bc2
...
...
@@ -3,6 +3,7 @@
#include <migraphx/kernels/hip.hpp>
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace
migraphx
{
...
...
@@ -12,23 +13,23 @@ struct index
index_int
local
=
0
;
index_int
group
=
0
;
__device__
index_int
nglobal
()
const
{
#ifdef MIGRAPHX_NGLOBAL
return
MIGRAPHX_NGLOBAL
;
constexpr
index_constant
<
MIGRAPHX_NGLOBAL
>
nglobal
()
const
{
return
{};
}
#else
__device__
index_int
nglobal
()
const
{
return
blockDim
.
x
*
gridDim
.
x
;
// NOLINT
#endif
}
#endif
__device__
index_int
nlocal
()
const
{
#ifdef MIGRAPHX_NLOCAL
return
MIGRAPHX_NLOCAL
;
constexpr
index_constant
<
MIGRAPHX_NLOCAL
>
nlocal
()
const
{
return
{};
}
#else
return
blockDim
.
x
;
// NOLINT
#endif
__device__
index_int
nlocal
()
const
{
return
blockDim
.
x
;
// NOLINT
}
#endif
template
<
class
F
>
__device__
void
global_stride
(
index_int
n
,
F
f
)
const
...
...
src/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp
View file @
2f268bc2
...
...
@@ -13,7 +13,7 @@ struct basic_iota_iterator
F
f
;
using
difference_type
=
diff_int
;
using
reference
=
decltype
(
f
(
std
::
declval
<
Iterator
>
()));
using
reference
=
decltype
(
f
(
declval
<
Iterator
>
()));
using
value_type
=
remove_reference_t
<
reference
>
;
using
pointer
=
add_pointer_t
<
value_type
>
;
...
...
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
View file @
2f268bc2
...
...
@@ -40,12 +40,31 @@ constexpr T as_float(T x)
return fname(x, xs...); \
}
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
inline auto __device__ name(type x, type y)->type { return fname(x, y); }
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_MATH_HALF(name, fname) \
template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())> \
auto __device__ name(migraphx::half x, Ts... xs) \
MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))
// Template with two overloads for math functions, one for half2 type and one for more generic
// <half, N> vectorization where N is 4 or another even number.
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_MATH_HALF2(name, fname) \
template <class... Ts> \
auto __device__ name(migraphx::vec<migraphx::half, 2> x, Ts... xs) \
MIGRAPHX_RETURNS(migraphx::vec<migraphx::half, 2>{fname(x, xs...)}); \
template <class... Ts, index_int N, MIGRAPHX_REQUIRES(N % 2 == 0 && (N > 2))> \
auto __device__ name(migraphx::vec<migraphx::half, N> x, Ts... xs) \
{ \
return vec_packed_transform<2>(x, xs...)( \
[](auto... ys) -> migraphx::vec<migraphx::half, 2> { return fname(ys...); }); \
}
MIGRAPHX_DEVICE_MATH
(
abs
,
::
abs
)
MIGRAPHX_DEVICE_MATH
(
acos
,
::
acos
)
MIGRAPHX_DEVICE_MATH
(
acosh
,
::
acosh
)
...
...
@@ -112,12 +131,65 @@ MIGRAPHX_DEVICE_MATH_HALF(sinh, ::sinh)
MIGRAPHX_DEVICE_MATH_HALF
(
tan
,
::
tan
)
MIGRAPHX_DEVICE_MATH_HALF
(
tanh
,
::
tanh
)
// Map math functions to hip half2 functions
// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
// packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
// Most but not all of these math ops have operators of the same names. Ones not yet implemented
// at this time are: exp2, exp10, log2, log10, isinf
MIGRAPHX_DEVICE_MATH_HALF2
(
abs
,
::
__habs2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
ceil
,
::
h2ceil
)
MIGRAPHX_DEVICE_MATH_HALF2
(
floor
,
::
h2floor
)
MIGRAPHX_DEVICE_MATH_HALF2
(
sin
,
::
h2sin
)
MIGRAPHX_DEVICE_MATH_HALF2
(
cos
,
::
h2cos
)
MIGRAPHX_DEVICE_MATH_HALF2
(
exp
,
::
h2exp
)
MIGRAPHX_DEVICE_MATH_HALF2
(
exp2
,
::
h2exp2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
exp10
,
::
h2exp10
)
MIGRAPHX_DEVICE_MATH_HALF2
(
log2
,
::
h2log2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
log
,
::
h2log
)
MIGRAPHX_DEVICE_MATH_HALF2
(
log10
,
::
h2log10
)
MIGRAPHX_DEVICE_MATH_HALF2
(
rsqrt
,
::
h2rsqrt
)
MIGRAPHX_DEVICE_MATH_HALF2
(
sqrt
,
::
h2sqrt
)
MIGRAPHX_DEVICE_MATH_HALF2
(
isinf
,
::
__hisinf2
)
MIGRAPHX_DEVICE_MATH_HALF2
(
isnan
,
::
__hisnan2
)
template
<
class
T
,
class
U
>
constexpr
auto
where
(
bool
cond
,
const
T
&
a
,
const
U
&
b
)
{
return
cond
?
a
:
b
;
}
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
float
,
max
,
::
max
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
float
,
min
,
::
min
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
double
,
max
,
::
max
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
double
,
min
,
::
min
)
// Add overloads for half that calls the float version
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
max
,
::
fmaxf
)
MIGRAPHX_DEVICE_MATH_BINARY_FOR
(
migraphx
::
half
,
min
,
::
fminf
)
template
<
class
T
,
MIGRAPHX_REQUIRES
(
not
is_any_vec
<
T
>())
>
constexpr
auto
max
(
const
T
&
a
,
const
T
&
b
)
{
return
where
(
a
<
b
,
b
,
a
);
}
template
<
class
T
,
MIGRAPHX_REQUIRES
(
not
is_any_vec
<
T
>())
>
constexpr
auto
min
(
const
T
&
a
,
const
T
&
b
)
{
return
where
(
a
<
b
,
a
,
b
);
}
template
<
class
T
,
class
U
,
MIGRAPHX_REQUIRES
(
not
is_same
<
T
,
U
>{}
and
not
is_any_vec
<
T
,
U
>
())
>
constexpr
auto
max
(
const
T
&
a
,
const
U
&
b
)
{
return
max
<
common_type_t
<
T
,
U
>>
(
a
,
b
);
}
template
<
class
T
,
class
U
,
MIGRAPHX_REQUIRES
(
not
is_same
<
T
,
U
>{}
and
not
is_any_vec
<
T
,
U
>
())
>
constexpr
auto
min
(
const
T
&
a
,
const
U
&
b
)
{
return
min
<
common_type_t
<
T
,
U
>>
(
a
,
b
);
}
MIGRAPHX_DEVICE_MATH_VEC
(
abs
)
MIGRAPHX_DEVICE_MATH_VEC
(
acos
)
MIGRAPHX_DEVICE_MATH_VEC
(
acosh
)
...
...
@@ -133,6 +205,8 @@ MIGRAPHX_DEVICE_MATH_VEC(exp)
MIGRAPHX_DEVICE_MATH_VEC
(
floor
)
MIGRAPHX_DEVICE_MATH_VEC
(
isnan
)
MIGRAPHX_DEVICE_MATH_VEC
(
log
)
MIGRAPHX_DEVICE_MATH_VEC
(
max
)
MIGRAPHX_DEVICE_MATH_VEC
(
min
)
MIGRAPHX_DEVICE_MATH_VEC
(
pow
)
MIGRAPHX_DEVICE_MATH_VEC
(
round
)
MIGRAPHX_DEVICE_MATH_VEC
(
rsqrt
)
...
...
@@ -143,18 +217,6 @@ MIGRAPHX_DEVICE_MATH_VEC(tan)
MIGRAPHX_DEVICE_MATH_VEC
(
tanh
)
MIGRAPHX_DEVICE_MATH_VEC
(
where
)
template
<
class
T
,
class
U
>
constexpr
auto
max
(
const
T
&
a
,
const
U
&
b
)
{
return
where
(
a
<
b
,
b
,
a
);
}
template
<
class
T
,
class
U
>
constexpr
auto
min
(
const
T
&
a
,
const
U
&
b
)
{
return
where
(
a
>
b
,
b
,
a
);
}
template
<
class
T
,
class
U
>
constexpr
auto
convert
(
U
v
)
{
...
...
src/targets/gpu/kernels/include/migraphx/kernels/
basic_
ops.hpp
→
src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
100755 → 100644
View file @
2f268bc2
#ifndef MIGRAPHX_GUARD_
AMDMIGRAPHX_
KERNELS_
BASIC_
OPS_HPP
#define MIGRAPHX_GUARD_
AMDMIGRAPHX_
KERNELS_
BASIC_
OPS_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_OPS_HPP
#define MIGRAPHX_GUARD_KERNELS_OPS_HPP
#include <migraphx/kernels/
types
.hpp>
#include <migraphx/kernels/
math
.hpp>
namespace
migraphx
{
namespace
op
{
struct
sum
{
template
<
class
T
,
class
U
>
constexpr
auto
operator
()(
T
x
,
U
y
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
,
U
y
)
const
{
return
x
+
y
;
}
...
...
@@ -17,7 +18,7 @@ struct sum
struct
product
{
template
<
class
T
,
class
U
>
constexpr
auto
operator
()(
T
x
,
U
y
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
,
U
y
)
const
{
return
x
*
y
;
}
...
...
@@ -26,7 +27,7 @@ struct product
struct
id
{
template
<
class
T
>
constexpr
auto
operator
()(
T
x
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
)
const
{
return
x
;
}
...
...
@@ -34,40 +35,39 @@ struct id
struct
mean
{
size_
t
item_num
=
1
;
index_in
t
item_num
=
1
;
template
<
class
T
>
constexpr
auto
operator
()(
T
x
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
)
const
{
return
x
/
static_cast
<
T
>
(
item_num
);
}
};
struct
max
_f
struct
max
{
template
<
class
T
,
class
U
>
constexpr
auto
operator
()(
T
x
,
U
y
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
,
U
y
)
const
{
return
(
x
>
y
)
?
x
:
y
;
return
migraphx
::
max
(
x
,
y
)
;
}
};
inline
constexpr
auto
max
=
max_f
{};
struct
min
_f
struct
min
{
template
<
class
T
,
class
U
>
constexpr
auto
operator
()(
T
x
,
U
y
)
const
MIGRAPHX_DEVICE_CONSTEXPR
auto
operator
()(
T
x
,
U
y
)
const
{
return
(
x
<
y
)
?
x
:
y
;
return
migraphx
::
min
(
x
,
y
)
;
}
};
inline
constexpr
auto
min
=
min_f
{};
}
// namespace op
// Sentinel convertible to the lowest finite value of any numeric type — a
// handy init value for max-reductions. This span interleaved the pre- and
// post-merge diff lines; post-merge it uses the kernel-side numeric_lowest<T>
// helper instead of std::numeric_limits<T>::lowest().
struct lowest
{
    template <class T>
    constexpr operator T() const
    {
        return numeric_lowest<T>();
    }
};
...
...
@@ -76,9 +76,8 @@ struct highest
// Conversion operator of struct highest (struct header is outside this view):
// yields the maximum finite value of the target type — init value for
// min-reductions. Post-merge it uses the kernel-side numeric_max<T> helper
// instead of std::numeric_limits<T>::max().
template <class T>
constexpr operator T() const
{
    return numeric_max<T>();
}
};
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_
AMDMIGRAPHX_
KERNELS_
BASIC_
OPS_HPP
#endif // MIGRAPHX_GUARD_KERNELS_OPS_HPP
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
View file @
2f268bc2
...
...
@@ -38,20 +38,17 @@ constexpr implicit_conversion_op<T> implicit_conversion(T x)
// Apply elementwise functor f across all elements of `out`, reading the i-th
// element of each input tensor and converting the result to out's type.
// This span interleaved the pre- and post-merge diff lines; this is the
// post-merge definition — the preload-to-LDS wrapper moved out of this
// function (it is now a composable transform, see auto_preload).
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    // global_stride distributes the element range over all launched threads.
    idx.global_stride(out.get_shape().elements(),
                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
}
// Build a pointwise kernel entry point. This span interleaved the pre- and
// post-merge diff lines; this is the post-merge definition: instead of taking
// the functor and raw pointers directly, it now takes extra argument
// transforms (e.g. preload/vectorize stages) and returns a callable
// `(f, ps...)` that wraps the raw pointers into tensors, rotates the output
// to the end, applies the transforms, and runs pointwise_tensor.
template <class... Transforms>
__device__ auto pointwise(index idx, Transforms... transforms)
{
    return [=](auto f, auto*... ps) {
        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
    };
}
}
// namespace migraphx
...
...
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
View file @
2f268bc2
...
...
@@ -3,6 +3,8 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>
namespace
migraphx
{
...
...
@@ -73,7 +75,7 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
if
constexpr
(
decltype
(
tensor_vec_size
(
x
)){}
==
0
)
{
auto
v
=
vectorize
(
x
);
auto
v
=
auto_
vectorize
(
x
);
auto
b
=
as_vec
(
tensor_vec_size
(
v
),
buffer
+
offset
);
idx
.
local_stride
(
v
.
get_shape
().
element_space
(),
[
&
](
auto
i
)
{
b
[
i
]
=
v
.
data
()[
i
];
});
...
...
@@ -126,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
};
}
// NOTE(review): pre-merge auto_preload shown on the removed side of this
// diff; the commit replaces it with a compile-time-selected
// preload_copy<B>/auto_preload<Bs...> pair. Kept verbatim for the record:
// it preloaded every input (except the output) keyed on the output's type.
inline __device__ auto auto_preload(index idx)
{
    return make_transform([=](auto f, auto out, auto... xs) {
        preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
    });
}
// Optionally stage tensor `x` into a __shared__ (LDS) buffer before calling
// the continuation `f`. The choice is made at compile time via B:
//   B == true  -> copy x's elements into shared memory cooperatively across
//                 the workgroup, then call f on a view rebased onto the buffer
//   B == false -> call f(x) untouched.
// NOTE: no __syncthreads() here — the caller (auto_preload below) is expected
// to synchronize before consuming the staged data. TODO confirm.
template <bool B, class T>
__device__ auto preload_copy(index idx, T x)
{
    return [=](auto f) {
        if constexpr(B)
        {
            using type = typename T::type;
            // element_space() covers the full addressable extent (including
            // stride gaps), so the buffer can mirror x's layout exactly.
            constexpr auto size = get_shape_c<T>{}.element_space();
            __shared__ type buffer[size];

            // TODO: Always vectorize when size > 4, and then use a second loop for remainder
            // Pick the widest vector width that divides the extent evenly.
            constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
            // remove_bool: reinterpret bool storage so it can be vector-copied.
            auto input = as_vec<n>(remove_bool(x.data()));
            auto b     = as_vec<n>(remove_bool(buffer));
            // Cooperative copy: each thread in the workgroup copies a strided
            // subset of the n-wide chunks.
            idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
            // x.with(buffer): same shape/strides, data rebased onto LDS.
            return f(x.with(buffer));
        }
        else
        {
            return f(x);
        }
    };
}
// Argument transform that preloads the inputs selected by the Bs flags (one
// flag per argument, decided by the host-side compiler pass) into shared
// memory via preload_copy, then invokes the wrapped kernel body. The
// __syncthreads() in `invoke` runs after all copies are issued and before the
// body reads them, so every thread sees the fully staged buffers.
template <bool... Bs>
__device__ auto auto_preload(index idx)
{
    return make_transform([=](auto f, auto... xs) {
        auto invoke = [=](auto... ys) {
            __syncthreads();
            f(ys...);
        };
        // join threads the per-argument continuations together and finally
        // calls invoke with the (possibly rebased) tensors.
        join(invoke, preload_copy<Bs>(idx, xs)...);
    });
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
src/targets/gpu/kernels/include/migraphx/kernels/print.hpp
View file @
2f268bc2
...
...
@@ -140,6 +140,10 @@ struct basic_printer
{
return
print_ulong
(
value
);
}
// Stream a half-precision value by widening it to double and reusing the
// double printing path (basic_printer has no native fp16 formatting).
__host__ __device__ const basic_printer& operator<<(migraphx::half value) const
{
    return print_double(value);
}
__host__
__device__
const
basic_printer
&
operator
<<
(
float
value
)
const
{
return
print_double
(
value
);
...
...
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
0 → 100644
View file @
2f268bc2
#ifndef MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#define MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#include <migraphx/kernels/dpp.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/ops.hpp>
namespace
migraphx
{
#if MIGRAPHX_HAS_DPP
// Generic wavefront reduction using DPP (data-parallel primitives) lane
// moves: repeatedly shift partial results across lanes (strides 1,2,4,8 within
// a row) and combine with `op`, then on 64-lane wavefronts broadcast across
// rows so lane 63 ends up holding the reduction of the whole wavefront.
// `in` is updated in place; the row/bank masks restrict which lanes
// participate at each step (see the AMD DPP ISA documentation — TODO confirm
// mask semantics per target).
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
    T out{};
    out = dpp_mov<dpp_row_shr(1)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(2)>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in  = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
    // Extra row-broadcast steps combine the four 16-lane rows on wave64.
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in  = op(in, out);
#endif
}
// Hand-written inline-asm variants of dpp_reduce for types/ops with native
// VALU instructions. Static analyzers can't parse GCN asm, so they get a
// trivial stub; wave64 adds the two row_bcast steps, wave32 pads with nops.
#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) x = 1
#elif __AMDGCN_WAVEFRONT_SIZE == 64
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
    __asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_bcast:15 row_mask:0xa\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_bcast:31 row_mask:0xc\n" \
                     "s_nop 1\n" \
                     : "=v"(x) \
                     : "0"(x))
#else
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
    __asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
                     "s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
                     "s_nop 1\n" \
                     "s_nop 1\n" \
                     : "=v"(x) \
                     : "0"(x))
#endif

// Defines dpp_reduce overloads for one reduction functor across the types
// with hardware instructions (f64/f32/f16/i32/u32).
// NOTE(review): int32_t is mapped to the unsigned prefix##_u32 instruction —
// confirm this is intended for op::min/op::max with negative values.
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
    __device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
    __device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
    __device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
    __device__ inline void dpp_reduce(int32_t& x, op) \
    { \
        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
    } \
    __device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }

MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
// Workgroup reduction, DPP path. Each thread folds its strided slice of the
// n elements produced by f(i) into x, dpp_reduce collapses each wavefront,
// the last lane of every wavefront publishes its partial to shared memory,
// and after a barrier every thread serially folds the per-wavefront partials.
// Returns the same reduced value on every thread of the workgroup.
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
// On wave32 targets dpp_reduce only collapses a 16-lane row (no cross-row
// broadcast), hence the smaller grouping. TODO confirm intent.
#if __AMDGCN_WAVEFRONT_SIZE == 32
    constexpr index_int lanes_per_thread = 16;
#else
    constexpr index_int lanes_per_thread = 64;
#endif
    using type = decltype(f(0));
    // One shared slot per lane group holds that group's partial result.
    __shared__ type buffer[idx.nlocal() / lanes_per_thread];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);

    const auto ldsidx = idx.local / lanes_per_thread;
    // dpp_reduce leaves the group's total in its highest lane.
    if((idx.local % lanes_per_thread) == lanes_per_thread - 1)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();

    // Final pass: every thread folds the (few) per-group partials itself,
    // so no second barrier or broadcast is needed.
    type y = init;
    for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++)
    {
        y = op(y, buffer[i]);
    }
    return y;
}
#else
// Workgroup reduction, portable fallback (no DPP): classic shared-memory tree
// reduction. Each thread folds its strided slice into x, writes it to its own
// shared slot, then pairs of slots are combined with doubling stride until
// slot 0 holds the full result. Every thread returns buffer[0].
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
    using type = decltype(f(0));
    __shared__ type buffer[idx.nlocal()];
    type x = init;
    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
    buffer[idx.local] = x;
    __syncthreads();

    for(index_int s = 1; s < idx.nlocal(); s *= 2)
    {
        // Thread t combines slots 2*s*t and 2*s*t + s; the bound check also
        // handles non-power-of-two workgroup sizes.
        const index_int index = 2 * s * idx.local;
        if(index + s < idx.nlocal())
        {
            buffer[index] = op(buffer[index], buffer[index + s]);
        }
        // Barrier each round so all combines at stride s land before stride 2s.
        __syncthreads();
    }
    return buffer[0];
}
#endif
// Build a tensor_view over the slice of `input` that reduces onto output
// element `i`: axes where the input and output lengths agree are collapsed to
// length 1, while reduced axes keep the full input extent; the input's
// strides are reused so the view walks the original storage.
// (Fix: dropped a stray empty statement `;` left after the transform call.)
template <class Output, class Input, class T>
constexpr auto reduce_slice(Input input, T i)
{
    constexpr auto lens = transform(get_shape_c<Input>{}.lens,
                                    get_shape_c<Output>{}.lens,
                                    [](index_int x, index_int y) -> index_int {
                                        if(x == y)
                                            return 1;
                                        return x;
                                    });
    constexpr auto s = make_shape(lens, get_shape_c<Input>{}.strides);
    // The slice anchored at i must stay inside input's addressable extent.
    MIGRAPHX_ASSERT((input.get_shape().index(i) + s.element_space()) <=
                    input.get_shape().element_space());
    return make_tensor_view(&input[i], s);
}
namespace
reduce
{
template
<
class
Slicer
,
class
F
>
constexpr
auto
sliced
(
Slicer
slicer
,
F
f
)
{
return
[
=
](
auto
x
,
auto
...
xs
)
{
// TODO: assert all elements are the same
return
f
(
slicer
(
x
),
slicer
(
xs
)...);
};
}
// Reduction algorithm where one whole workgroup cooperates on each output
// element (the reduced extent is large). Used as the Algo parameter of
// simple_reduce.
struct block
{
    // Per-output-element handle passed to the user callback.
    template <class Slicer>
    struct reducer
    {
        index idx;
        Slicer slicer;

        // Reduce the sliced inputs with `op`, starting from `init`; `read`
        // maps the j-th element of each slice to the value being folded.
        // vec_reduce additionally folds across vector lanes when the element
        // type is vectorized.
        template <class Op, class T, class Read>
        __device__ auto reduce(Op op, T init, Read read) const
        {
            return sliced(slicer, [=](auto x, auto... xs) {
                return vec_reduce(block_reduce(idx,
                                               op,
                                               init,
                                               x.get_shape().elements(),
                                               [&](auto j) { return read(x[j], xs[j]...); }),
                                  op);
            });
        }

        // Run f once per output element (block_reduce returns the same value
        // on every thread, so only the leader writes it out).
        template <class F>
        __device__ void outer(F f) const
        {
            if(idx.local == 0)
                f();
        }
    };

    template <class Slicer>
    static __device__ auto make(index idx, Slicer slicer)
    {
        return reducer<Slicer>{idx, slicer};
    }

    // Drive f over all output elements: stride by nelements * workgroup size
    // so i / nlocal() identifies the output element this workgroup handles.
    template <class Output, class F>
    static __device__ void run(F f)
    {
        auto idx                 = make_index();
        constexpr auto nelements = get_shape_c<Output>{}.elements();
        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
            const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
            f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
        });
    }
};
// Reduction algorithm where each thread handles one output element on its own
// with a plain serial loop (the reduced extent is small). Mirrors the
// interface of reduce::block so simple_reduce can select either at compile
// time.
struct lane
{
    template <class Slicer>
    struct reducer
    {
        index idx;
        Slicer slicer;

        // Serially fold the sliced inputs with `op`, starting from `init`.
        template <class Op, class T, class Read>
        __device__ auto reduce(Op op, T init, Read read) const
        {
            return sliced(slicer, [=](auto x, auto... xs) {
                using type = typename decltype(x)::type;
                type r     = init;
                for(index_int j = 0; j < x.get_shape().elements(); j++)
                {
                    r = op(r, read(x[j], xs[j]...));
                }
                return r;
            });
        }

        // Each thread owns its output element, so no leader check is needed.
        template <class F>
        __device__ void outer(F f) const
        {
            f();
        }
    };

    template <class Slicer>
    static __device__ auto make(index idx, Slicer slicer)
    {
        return reducer<Slicer>{idx, slicer};
    }

    // One output element per global thread index.
    template <class Output, class F>
    static __device__ void run(F f)
    {
        auto idx                 = make_index();
        constexpr auto nelements = get_shape_c<Output>{}.elements();
        idx.global_stride(nelements, [&](auto i) {
            const auto out_idx = get_shape_c<Output>{}.multi(i);
            f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
        });
    }
};
}
// namespace reduce
// Top-level driver for a plain reduction: for every output element, fold the
// corresponding input slice with `op` starting from `init` (`read` maps raw
// input elements to values, `write` post-processes the reduced value, e.g.
// op::mean dividing by the count) and store the result. Algo selects the
// cooperation strategy (reduce::block or reduce::lane above).
// (Fix: corrected the misspelled template parameter WriteOuput -> WriteOutput;
// purely internal, callers deduce it.)
template <class Algo,
          class Op,
          class T,
          class Input,
          class Output,
          class ReadInput,
          class WriteOutput>
__device__ void
simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOutput write)
{
    Algo::template run<Output>([&](auto out_idx, auto r) {
        auto x = r.reduce(op, init, read)(input);
        // Only the designated writer (per Algo) stores the final value.
        r.outer([&] { output[out_idx] = write(x); });
    });
}
}
// namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
View file @
2f268bc2
...
...
@@ -3,14 +3,15 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/array.hpp>
namespace
migraphx
{
struct
max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR
auto
init
()
{
return
lowest
()
;
}
MIGRAPHX_DEVICE_CONSTEXPR
auto
init
()
{
return
lowest
{}
;
}
template
<
class
T
>
MIGRAPHX_DEVICE_CONSTEXPR
T
operator
()(
T
x
,
T
y
)
...
...
@@ -55,7 +56,7 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
return
0
;
}
xy
[
ii
]
=
max
(
xy
[
ii
],
0.0
f
);
xy
[
ii
]
=
migraphx
::
max
(
xy
[
ii
],
0.0
f
);
low
[
ii
]
=
xy
[
ii
];
high
[
ii
]
=
low
[
ii
]
+
1
;
if
(
low
[
ii
]
>=
dims
[
ii
]
-
1
)
...
...
@@ -118,15 +119,13 @@ constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
}
template
<
class
T
,
class
U
,
class
V
,
class
W
,
class
Settings
>
__device__
void
roialign
(
const
T
&
x_t
,
const
U
&
rois_t
,
const
V
&
ind_t
,
const
W
&
y_t
,
Settings
s
)
__device__
void
roialign
(
const
T
&
x_t
,
const
U
&
rois_t
,
const
V
&
ind_t
,
W
&
y_t
,
Settings
s
)
{
auto
index
=
make_index
();
const
auto
x
=
x_t
.
begin
();
const
auto
rois
=
rois_t
.
begin
();
const
auto
ind
=
ind_t
.
begin
();
auto
out_ptr
=
y_t
.
begin
();
// input shape
auto
x_lens
=
x_t
.
get_shape
().
lens
;
auto
channel_num
=
x_lens
[
1
];
...
...
@@ -166,35 +165,36 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
for
(
index_int
ii
=
0
;
ii
<
roi_size
.
size
();
++
ii
)
{
roi_size
[
ii
]
=
roi_ends
[
ii
]
-
roi_starts
[
ii
];
roi_size
[
ii
]
=
max
(
roi_size
[
ii
],
1.0
f
);
roi_size
[
ii
]
=
migraphx
::
max
(
roi_size
[
ii
],
1.0
f
);
bin_size
[
ii
]
=
roi_size
[
ii
]
/
out_dims
[
ii
];
bin_grid_size
[
ii
]
=
(
s
.
sampling_ratio
>
0
)
?
s
.
sampling_ratio
:
std
::
ceil
(
roi_size
[
ii
]
/
out_dims
[
ii
]);
bin_size
[
ii
]
=
roi_size
[
ii
]
/
out_dims
[
ii
];
bin_grid_size
[
ii
]
=
(
s
.
sampling_ratio
>
0
)
?
s
.
sampling_ratio
:
migraphx
::
ceil
(
roi_size
[
ii
]
/
out_dims
[
ii
]);
}
const
auto
offset_x
=
x
+
((
batch_ind
*
channel_num
+
c
)
*
in_dims
[
0
]
*
in_dims
[
1
]);
if
constexpr
(
s
.
is_avg_pooling
)
{
out_ptr
[
i
]
=
calc_pooling
(
offset_x
,
roi_starts
,
bin_size
,
{
ph
,
pw
},
bin_grid_size
,
in_dims
,
s
.
roi_offset
,
avg_pool
{});
y_t
[
i
]
=
calc_pooling
(
offset_x
,
roi_starts
,
bin_size
,
{
ph
,
pw
},
bin_grid_size
,
in_dims
,
s
.
roi_offset
,
avg_pool
{});
}
else
{
out_ptr
[
i
]
=
calc_pooling
(
offset_x
,
roi_starts
,
bin_size
,
{
ph
,
pw
},
bin_grid_size
,
in_dims
,
s
.
roi_offset
,
max_pool
{});
y_t
[
i
]
=
calc_pooling
(
offset_x
,
roi_starts
,
bin_size
,
{
ph
,
pw
},
bin_grid_size
,
in_dims
,
s
.
roi_offset
,
max_pool
{});
}
}
}
...
...
Prev
1
…
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment