gaoqiong / MIGraphX / Commits / 7f65a88e

Commit 7f65a88e, authored Feb 04, 2022 by Paul

    Merge branch 'develop' into mlir-c

Parents: 79bfe69f, b20e3d4d
Changes: 66 files in total; showing 20 changed files with 940 additions and 122 deletions (+940 -122).
Changed files:

  src/program.cpp (+72, -2)
  src/targets/gpu/CMakeLists.txt (+6, -2)
  src/targets/gpu/argmax.cpp (+1, -1)
  src/targets/gpu/argmin.cpp (+1, -1)
  src/targets/gpu/compile_hip_code_object.cpp (+1, -0)
  src/targets/gpu/compile_pointwise.cpp (+10, -2)
  src/targets/gpu/fuse_ops.cpp (+178, -5)
  src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp (+2, -1)
  src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp (+57, -6)
  src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp (+38, -2)
  src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp (+1, -0)
  src/targets/gpu/kernels/include/migraphx/kernels/math.hpp (+56, -14)
  src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp (+27, -2)
  src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp (+23, -7)
  src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp (+53, -0)
  src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp (+137, -77)
  test/onnx/gen_onnx.py (+246, -0)
  test/onnx/gen_onnx.pyc (+0, -0)
  test/onnx/greaterorequal_test.onnx (+16, -0)
  test/onnx/hardsigmoid_default_test.onnx (+15, -0)
src/program.cpp

@@ -180,6 +180,63 @@ void program::finalize()
    mm->finalize(this->impl->ctx);
}

template <class T>
std::string classify(T x)
{
    switch(std::fpclassify(x))
    {
    case FP_INFINITE: return "inf";
    case FP_NAN: return "nan";
    case FP_NORMAL: return "normal";
    case FP_SUBNORMAL: return "subnormal";
    case FP_ZERO: return "zero";
    default: return "unknown";
    }
}

std::unordered_set<std::string> classify_argument(const argument& a)
{
    std::unordered_set<std::string> result;
    a.visit(
        [&](auto t) {
            for(const auto& x : t)
                result.insert(classify(x));
        },
        [&](const auto& xs) {
            for(const auto& x : xs)
            {
                auto r = classify_argument(x);
                result.insert(r.begin(), r.end());
            }
        });
    return result;
}

void preview_argument(std::ostream& os, const argument& a)
{
    a.visit(
        [&](auto t) {
            if(t.size() <= 10)
            {
                os << t;
            }
            else
            {
                os << to_string_range(t.begin(), t.begin() + 5);
                os << ", ..., ";
                os << to_string_range(t.end() - 5, t.end());
            }
        },
        [&](const auto& xs) {
            for(const auto& x : xs)
            {
                os << '{';
                preview_argument(os, x);
                os << '}';
            }
        });
}

template <class F>
std::vector<argument> generic_eval(const module* mod,
                                   context& ctx,
                                   ...

@@ -312,8 +369,21 @@ std::vector<argument> program::eval(parameter_map params) const
    if(trace_level > 1 and ins->name().front() != '@' and ins->name() != "load" and
       not result.empty())
    {
-       target tgt = make_target(this->impl->target_name);
-       std::cout << "Output: " << tgt.copy_from(result) << std::endl;
        target tgt  = make_target(this->impl->target_name);
        auto buffer = tgt.copy_from(result);
        if(trace_level == 2)
        {
            std::cout << "Output has " << to_string_range(classify_argument(buffer))
                      << std::endl;
            std::cout << "Output: ";
            preview_argument(std::cout, buffer);
            std::cout << std::endl;
        }
        else
        {
            std::cout << "Output: " << buffer << std::endl;
        }
    }
    return result;
}));
...
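For context, here is a minimal host-only sketch (illustrative code, not part of the diff) of what the new level-2 trace reports; in MIGraphX the trace level is typically driven by a trace environment variable such as MIGRAPHX_TRACE_EVAL. classify() buckets every element with std::fpclassify, so the trace can print the set of value categories before previewing the first and last few elements.

#include <cmath>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include <vector>

std::string classify(float x)
{
    switch(std::fpclassify(x))
    {
    case FP_INFINITE: return "inf";
    case FP_NAN: return "nan";
    case FP_NORMAL: return "normal";
    case FP_SUBNORMAL: return "subnormal";
    case FP_ZERO: return "zero";
    default: return "unknown";
    }
}

int main()
{
    std::vector<float> buffer = {1.5f, 0.0f, std::numeric_limits<float>::infinity()};
    std::set<std::string> categories;
    for(float x : buffer)
        categories.insert(classify(x));
    // Prints the distinct categories: inf normal zero
    for(const auto& c : categories)
        std::cout << c << ' ';
    std::cout << '\n';
}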
src/targets/gpu/CMakeLists.txt

@@ -327,8 +327,12 @@ target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "$<LINK_LANGUAGE:CXX>" "1" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "SHELL:" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Skip library paths since hip will incorrectly treat it as a source file
string(APPEND HIP_COMPILER_FLAGS " ")
foreach(_unused RANGE 2)
    string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endforeach()
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")

target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
...
src/targets/gpu/argmax.cpp

@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
-   check_shapes{inputs, *this}.has(2).standard();
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
...
src/targets/gpu/argmin.cpp

@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
-   check_shapes{inputs, *this}.has(2).standard();
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
...
src/targets/gpu/compile_hip_code_object.cpp

@@ -114,6 +114,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
    options.params += " " + join_strings(compiler_warnings(), " ");
    options.params += " -ftemplate-backtrace-limit=0";
    options.params += " -Werror";
    auto cos = compile_hip_src(srcs, std::move(options.params), get_device_name());
    if(cos.size() != 1)
...
src/targets/gpu/compile_pointwise.cpp

@@ -63,8 +63,16 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu
    run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
    cpp_generator g;
    g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
-   auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"}));
-   return compile_pointwise((ctx), inputs, "&" + name, g.str());
    g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
    g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
    g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
    g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
    g.add_point_op("less", "migraphx::abs(${0} < ${1})");
    g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
    g.add_point_op("not", "migraphx::abs(not ${0})");
    auto name = g.create_function(
        g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
    return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());
}

} // namespace gpu
...
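A small standalone sketch of why the comparison templates above likely wrap their result in migraphx::abs (this is an inference from the diff, not a statement from the source): the kernel vec<T, N> types are built on compiler vector extensions, where an elementwise comparison yields -1 (all bits set) in the "true" lanes rather than 1, and taking the absolute value normalizes that mask to 1/0. The sketch below uses the GCC-style vector_size attribute, which both g++ and clang++ accept.

#include <cassert>

// 4 lanes of 32-bit int.
typedef int int4 __attribute__((vector_size(16)));

int main()
{
    int4 a = {1, 2, 3, 4};
    int4 b = {1, 0, 3, 9};

    int4 mask = (a == b); // lanes are -1 where equal, 0 where not
    assert(mask[0] == -1 && mask[1] == 0 && mask[2] == -1 && mask[3] == 0);

    // What abs() of the mask amounts to here: 1 where equal, 0 where not.
    int4 ones = -mask;
    assert(ones[0] == 1 && ones[1] == 0 && ones[2] == 1 && ones[3] == 0);
    return 0;
}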
src/targets/gpu/fuse_ops.cpp

@@ -62,6 +62,8 @@ struct fusion
        keep_alive(std::move(t));
    }

    bool empty() const { return fp == nullptr; }

    op_t operator[](std::size_t i) const
    {
        assert(fp);
...

@@ -125,12 +127,11 @@ struct fusion
        return shape{shape::int8_type, {ws_size}};
    }

-   void compile(context& ctx)
    bool compile(context& ctx)
    {
        assert(fp);
-       auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
-       if(status != miopenStatusSuccess)
-           MIGRAPHX_THROW("Compiling fusion plan failed");
        return miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get()) ==
               miopenStatusSuccess;
    }

    argument execute(context& ctx,
...

@@ -561,6 +562,117 @@ struct find_mul_add_relu
    }
};

struct miopen_fusion
{
    struct fuse_op_data
    {
        operation op;
        float alpha = 1;
        float beta  = 0;
    };
    struct fuse_op : fuse_op_data, reflect_equality<fuse_op>, reflect_stream<fuse_op>
    {
        template <class Self, class F>
        static auto reflect(Self& self, F f)
        {
            return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta"));
        }
    };
    std::vector<fuse_op> ops = {};
    fusion f                 = {};
    std::function<void(context&, const fusion&, const std::vector<argument>&)> execute;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.ops, "ops"));
    }
    value compile(context& ctx, const shape&, std::vector<shape> inputs)
    {
        // Compensate for allocation
        inputs.pop_back();
        std::size_t i = 0;
        f             = fusion(inputs[i]);
        i++;
        std::vector<std::function<void(const fused_operator_args&, const std::vector<argument>&)>>
            invokers;
        for(auto&& fop : ops)
        {
            if(i > inputs.size())
            {
                f = {};
                return {};
            }
            if(fop.op.name() == "convolution")
            {
                auto* mop = f.create_conv(any_cast<op::convolution>(fop.op), inputs[i]);
                invokers.push_back(
                    [=](const fused_operator_args& fargs, const std::vector<argument>& args) {
                        miopenSetOpArgsConvForward(
                            fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
                    });
                i++;
            }
            else if(fop.op.name() == "add")
            {
                auto* mop = f.create_bias(inputs[i]);
                invokers.push_back(
                    [=](const fused_operator_args& fargs, const std::vector<argument>& args) {
                        miopenSetOpArgsBiasForward(
                            fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
                    });
                i++;
            }
            else if(fop.op.name() == "relu")
            {
                auto* mop = f.create_relu();
                invokers.push_back([=](const fused_operator_args& fargs,
                                       const std::vector<argument>&) {
                    miopenSetOpArgsActivForward(fargs.get(), mop, &fop.alpha, &fop.beta, 0, 0, 0);
                });
            }
            else
            {
                f = {};
                return {};
            }
        }
        if(not f.compile(ctx))
        {
            f = {};
            return {};
        }
        execute = [invokers](context& c, const fusion& ff, const std::vector<argument>& args) {
            auto fargs = make_fused_args();
            for(auto&& invoker : invokers)
                invoker(fargs, args);
            ff.execute(c, fargs, args.front(), args.back());
        };
        return {{"workspace", f.get_workspace(ctx).bytes()}};
    }
    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        if(not f.empty())
            return;
        auto v = compile(ctx, output_shape, inputs);
        if(not v.is_object())
            MIGRAPHX_THROW("Failed to compile fusion plan");
    }
    std::string name() const { return "gpu::miopen_fusion"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(ops.empty())
            return {};
        // TODO: Check number of arguments
        return ops.front().op.compute_shape({inputs[0], inputs[1]});
    }
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        execute(ctx, f, args);
        return args.back();
    }
};

struct miopen_conv_bias
{
    op::convolution op;
...

@@ -596,7 +708,8 @@ struct miopen_conv_bias
        f    = fusion(inputs[0]);
        conv = f.create_conv(op, inputs[1]);
        bias = f.create_bias(inputs[3]);
-       f.compile(ctx);
        if(not f.compile(ctx))
            MIGRAPHX_THROW("Failed to compile fusion plan");
    }

    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
...

@@ -683,6 +796,25 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
    p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
}

inline auto precompile_name(std::string s) // NOLINT
{
    return match::make_basic_pred_matcher([=](instruction_ref ins) {
        if(ins->name() != "gpu::precompile_op")
            return false;
        auto op = from_value<operation>(ins->get_operator().to_value().at("op"));
        return (op.name() == s);
    });
}

template <class... Ms>
auto conv_bias_pointwise(Ms... ms)
{
    return precompile_name("pointwise")(
        match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
                                fusable_conv(match::used_once()).bind("conv")),
        ms...);
}

struct find_conv_bias
{
    context* ctx = nullptr;
...

@@ -709,6 +841,46 @@ struct find_conv_bias_relu
    }
};

struct find_conv_pointwise
{
    context* ctx = nullptr;
    auto matcher() const
    {
        return precompile_name("pointwise")(
            match::nargs(3),
            match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
                                    fusable_conv(match::used_once()).bind("conv")));
    }

    void apply(module& m, match::matcher_result r) const
    {
        auto conv_ins    = r.instructions["conv"];
        auto bias_ins    = r.instructions["bias"];
        auto ins         = r.result;
        auto input_ins   = conv_ins->inputs().at(0);
        auto weights_ins = conv_ins->inputs().at(1);
        auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
        auto alloc_ins   = ins->inputs().back();
        module_ref pm    = ins->module_inputs().front();
        miopen_fusion op{};
        op.ops.push_back({{conv_op}});
        for(auto&& i : *pm)
        {
            if(i.name()[0] == '@')
                continue;
            auto inputs = to_shapes(i.inputs());
            op.ops.push_back({{i.get_operator()}});
        }
        std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
        auto v = op.compile(*ctx, ins->get_shape(), to_shapes(inputs));
        if(not v.is_object())
            return;
        m.replace_instruction(ins, op, inputs);
    }
};

struct find_gemm_add
{
    auto matcher() const
...

@@ -778,6 +950,7 @@ void fuse_ops::apply(module& p) const
    match::find_matches(p, find_triadd{});
    match::find_matches(p,
                        find_layernorm{},
                        find_conv_pointwise{ctx},
                        find_conv_bias_relu{ctx},
                        find_conv_bias{ctx},
                        find_add_gelu{},
...
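The new miopen_fusion operator exposes its members through the usual reflect/pack pattern (each member is paired with its name and handed to a visitor). A standalone C++17 sketch of that pattern, using plain std types instead of the MIGraphX reflect helpers, shows how a consumer unpacks the pairs:

#include <iostream>
#include <string>
#include <utility>

template <class... Ts>
auto pack(Ts... xs)
{
    // Capture the arguments and replay them into any callable.
    return [=](auto f) { return f(xs...); };
}

struct fuse_op_data
{
    std::string op = "convolution";
    float alpha    = 1;
    float beta     = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta"));
    }
};

int main()
{
    fuse_op_data d;
    // Pair each member with its name, then unpack and print "name=value" for every member.
    auto members = fuse_op_data::reflect(
        d, [](auto& x, const char* name) { return std::make_pair(name, x); });
    members([](auto... ps) { ((std::cout << ps.first << "=" << ps.second << "\n"), ...); });
}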
src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp

@@ -76,8 +76,9 @@ void arg_op(Op op, hipStream_t stream, const argument& result, const argument& a
    size_t batch_item_num = batch_lens[axis];
    batch_lens[axis]      = 1;
    migraphx::shape batch_shape{arg_shape.type(), batch_lens};
    migraphx::shape std_arg_shape{arg_shape.type(), arg_shape.lens()};

-   hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
    hip_visit_all(arg, std_arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
        auto* output = device_cast(result.get<int64_t>().data());
        using type   = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
        // use one block for items in one batch.
...
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp (file mode changed from 100755 to 100644)

@@ -5,6 +5,9 @@
namespace migraphx {

#define MIGRAPHX_STRINGIZE_1(...) #__VA_ARGS__
#define MIGRAPHX_STRINGIZE(...) MIGRAPHX_STRINGIZE_1(__VA_ARGS__)

// Workaround hip's broken abort on device code
#ifdef __HIP_DEVICE_COMPILE__
// NOLINTNEXTLINE
...

@@ -14,19 +17,67 @@ namespace migraphx {
#define MIGRAPHX_HIP_NORETURN [[noreturn]]
#endif

namespace debug {

struct swallow
{
    template <class... Ts>
    constexpr swallow(Ts&&...)
    {
    }
};

template <size_t N>
struct print_buffer
{
    char buffer[N + 1] = {0};
    char* pos          = buffer;

    constexpr void append(char c)
    {
        if(c == 0)
            return;
        if(pos < buffer + N)
        {
            *pos = c;
            pos++;
        }
    }

    template <size_t M>
    constexpr void append(const char (&array)[M])
    {
        for(int i = 0; i < M; i++)
            append(array[i]);
    }
};

template <class... Ts>
__host__ __device__ void print(const Ts&... xs)
{
    const auto size = (sizeof(xs) + ...);
    print_buffer<size> buffer;
    swallow{(buffer.append(xs), 0)...};
    printf("%s", buffer.buffer);
}

} // namespace debug

// noreturn cannot be used on this function because abort in hip is broken
template <class T1, class T2, class T3, class T4>
MIGRAPHX_HIP_NORETURN inline __host__ __device__ void
-assert_fail(const char* assertion, const char* file, unsigned int line, const char* function)
assert_fail(const T1& assertion, const T2& file, const T3& line, const T4& function)
{
-   printf("%s:%u: %s: assertion '%s' failed.\n", file, line, function, assertion);
    // printf is broken on hip with more than one argument, so use a simple print functions instead
    debug::print(file, ":", line, ": ", function, ": assertion '", assertion, "' failed.\n");
    // printf("%s:%s: %s: assertion '%s' failed.\n", file, line, function, assertion);
    abort();
}

#ifdef MIGRAPHX_DEBUG
-#define MIGRAPHX_ASSERT(cond) \
-    ((cond) ? void(0) : [](auto... xs) { \
-        assert_fail(xs...); \
-    }(#cond, __FILE__, __LINE__, __PRETTY_FUNCTION__))
#define MIGRAPHX_ASSERT(cond)                                 \
    ((cond) ? void(0) : [](auto&&... private_migraphx_xs) {   \
        assert_fail(private_migraphx_xs...);                  \
    }(#cond, __FILE__, MIGRAPHX_STRINGIZE(__LINE__), __PRETTY_FUNCTION__))
#else
#define MIGRAPHX_ASSERT(cond)
#endif
...
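The rewritten assert_fail builds the whole message into one buffer and emits a single printf("%s", ...), working around hip's broken multi-argument printf. A host-only analogue of the same trick (an illustrative sketch, not the device header): the buffer size is summed from the sizeof of each string literal, and the pieces are appended by expanding a pack into a swallow initializer.

#include <cstddef>
#include <cstdio>

struct swallow
{
    template <class... Ts>
    swallow(Ts&&...)
    {
    }
};

template <std::size_t N>
struct print_buffer
{
    char buffer[N + 1] = {0};
    char* pos          = buffer;

    void append(char c)
    {
        if(c == 0)
            return;
        if(pos < buffer + N)
            *pos++ = c;
    }

    template <std::size_t M>
    void append(const char (&array)[M])
    {
        for(std::size_t i = 0; i < M; i++)
            append(array[i]);
    }
};

template <class... Ts>
void print(const Ts&... xs)
{
    // sizeof a string literal includes its trailing '\0', so the sum is a safe upper bound
    // on the concatenated length.
    constexpr auto size = (sizeof(Ts) + ...);
    print_buffer<size> buffer;
    // Braced-init-lists evaluate left to right, so the pieces are appended in order.
    swallow{(buffer.append(xs), 0)...};
    std::printf("%s", buffer.buffer);
}

int main()
{
    print("file.cpp", ":", "42", ": ", "my_function", ": assertion '", "x > 0", "' failed.\n");
}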
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp

@@ -137,12 +137,48 @@ constexpr void each_args(F)
{
}

template <class F, class T>
constexpr auto fold_impl(F&&, T&& x)
{
    return static_cast<T&&>(x);
}

template <class F, class T, class U, class... Ts>
constexpr auto fold_impl(F&& f, T&& x, U&& y, Ts&&... xs)
{
    return fold_impl(f, f(static_cast<T&&>(x), static_cast<U&&>(y)), static_cast<Ts&&>(xs)...);
}

template <class F>
constexpr auto fold(F f)
{
    return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
}

template <class... Ts>
-auto pack(Ts... xs)
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class Compare, class P1, class P2>
constexpr auto pack_compare(Compare compare, P1 p1, P2 p2)
{
    return p1([&](auto... xs) {
        return p2([&](auto... ys) {
            auto c = [&](auto x, auto y) -> int {
                if(compare(x, y))
                    return 1;
                else if(compare(y, x))
                    return -1;
                else
                    return 0;
            };
            return fold([](auto x, auto y) { return x ? x : y; })(c(xs, ys)..., 0);
        });
    });
}

template <index_int N>
constexpr auto arg_c()
{
...

@@ -187,7 +223,7 @@ constexpr auto transform_args(F f, Fs... fs)
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))

} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
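fold() and pack_compare() are the building blocks the vectorizer below uses to pick the fastest non-broadcast axis: pack_compare compares two packs lexicographically and returns the first nonzero per-element comparison. A standalone C++17 sketch of that behaviour (plain functions, no kernel headers):

#include <cassert>

template <class F, class T>
auto fold_impl(F&&, T&& x)
{
    return static_cast<T&&>(x);
}

template <class F, class T, class U, class... Ts>
auto fold_impl(F&& f, T&& x, U&& y, Ts&&... xs)
{
    return fold_impl(f, f(static_cast<T&&>(x), static_cast<U&&>(y)), static_cast<Ts&&>(xs)...);
}

template <class F>
auto fold(F f)
{
    return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
}

template <class... Ts>
auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class Compare, class P1, class P2>
auto pack_compare(Compare compare, P1 p1, P2 p2)
{
    return p1([&](auto... xs) {
        return p2([&](auto... ys) {
            auto c = [&](auto x, auto y) -> int {
                if(compare(x, y))
                    return 1;
                else if(compare(y, x))
                    return -1;
                else
                    return 0;
            };
            // First nonzero element-wise comparison wins; 0 means the packs compare equal.
            return fold([](auto x, auto y) { return x ? x : y; })(c(xs, ys)..., 0);
        });
    });
}

int main()
{
    auto less = [](auto x, auto y) { return x < y; };
    assert(fold([](auto x, auto y) { return x + y; })(1, 2, 3, 4) == 10);
    // (1, 8) vs (1, 9): first elements tie, the second decides, so "less" holds.
    assert(pack_compare(less, pack(1, 8), pack(1, 9)) == 1);
    assert(pack_compare(less, pack(2, 8), pack(1, 9)) == -1);
    assert(pack_compare(less, pack(1, 8), pack(1, 8)) == 0);
    return 0;
}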
src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp

@@ -13,6 +13,7 @@ struct integral_constant
    using type = integral_constant;
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
    static constexpr type to() { return {}; }
};

// NOLINTNEXTLINE
...
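A minimal sketch of why the new static to() helper is handy (illustrative code; the real header also has the _c<...> machinery): values such as the vectorization axis are carried around as integral_constant types, and Type::to() rebuilds the constant from the type alone, so generic lambdas in vectorize.hpp can read it without relying on a captured object being a constant expression.

#include <cstddef>

template <class T, T V>
struct integral_constant
{
    static constexpr T value = V;
    using value_type         = T;
    using type               = integral_constant;
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
    static constexpr type to() { return {}; }
};

template <class Axis>
constexpr std::size_t divide_axis_len(std::size_t len, std::size_t n, Axis)
{
    // The parameter is unnamed; the constant is recovered from the type alone.
    constexpr auto axis = Axis::to();
    return axis == 1 ? len / n : len;
}

int main()
{
    using axis1 = integral_constant<std::size_t, 1>;
    static_assert(divide_axis_len(8, 4, axis1{}) == 2, "axis 1 is divided by the vector size");
    return 0;
}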
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp

@@ -4,6 +4,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/vec.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <hip/hip_fp16.h>
#include <hip/math_functions.h>
...

@@ -19,19 +20,30 @@ constexpr T as_float(T x)
} // namespace math

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH(name, fname) \
-    template <class... Ts>                \
-    auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(fname(xs...))
#define MIGRAPHX_DEVICE_MATH(name, fname)                               \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>  \
    auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(fname(xs...))

// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_MATH_VEC(name)                                        \
    template <class... Ts, MIGRAPHX_REQUIRES(is_any_vec<Ts...>())>            \
    auto __device__ name(Ts... xs)                                            \
    {                                                                         \
        return vec_transform(xs...)([](auto... ys) { return name(ys...); }); \
    }

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname) \
-    template <class... Ts>                          \
-    auto __device__ name(type x, Ts... xs) MIGRAPHX_RETURNS(fname(x, xs...))
#define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                     \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>  \
    auto __device__ name(type x, Ts... xs)->type                        \
    {                                                                   \
        return fname(x, xs...);                                         \
    }

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH_HALF(name, fname) \
-    template <class... Ts>                     \
-    auto __device__ name(migraphx::half x, Ts... xs) \
#define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                           \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>   \
    auto __device__ name(migraphx::half x, Ts... xs)                     \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))

MIGRAPHX_DEVICE_MATH(abs, ::abs)
...

@@ -99,21 +111,51 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)

template <class T, class U>
-constexpr auto& max(const T& a, const U& b)
constexpr auto where(bool cond, const T& a, const U& b)
{
-   return (a < b) ? b : a;
    return cond ? a : b;
}

MIGRAPHX_DEVICE_MATH_VEC(abs)
MIGRAPHX_DEVICE_MATH_VEC(acos)
MIGRAPHX_DEVICE_MATH_VEC(acosh)
MIGRAPHX_DEVICE_MATH_VEC(asin)
MIGRAPHX_DEVICE_MATH_VEC(asinh)
MIGRAPHX_DEVICE_MATH_VEC(atan)
MIGRAPHX_DEVICE_MATH_VEC(atanh)
MIGRAPHX_DEVICE_MATH_VEC(ceil)
MIGRAPHX_DEVICE_MATH_VEC(cos)
MIGRAPHX_DEVICE_MATH_VEC(cosh)
MIGRAPHX_DEVICE_MATH_VEC(erf)
MIGRAPHX_DEVICE_MATH_VEC(exp)
MIGRAPHX_DEVICE_MATH_VEC(floor)
MIGRAPHX_DEVICE_MATH_VEC(log)
MIGRAPHX_DEVICE_MATH_VEC(pow)
MIGRAPHX_DEVICE_MATH_VEC(round)
MIGRAPHX_DEVICE_MATH_VEC(rsqrt)
MIGRAPHX_DEVICE_MATH_VEC(sin)
MIGRAPHX_DEVICE_MATH_VEC(sinh)
MIGRAPHX_DEVICE_MATH_VEC(sqrt)
MIGRAPHX_DEVICE_MATH_VEC(tan)
MIGRAPHX_DEVICE_MATH_VEC(tanh)
MIGRAPHX_DEVICE_MATH_VEC(where)

template <class T, class U>
-constexpr auto& min(const T& a, const U& b)
constexpr auto max(const T& a, const U& b)
{
-   return (a > b) ? b : a;
    return where(a < b, b, a);
}

template <class T, class U>
-constexpr T convert(U x)
constexpr auto min(const T& a, const U& b)
{
-   return x;
    return where(a > b, b, a);
}

template <class T, class U>
constexpr auto convert(U v)
{
    return vec_transform(v)([](auto x) -> T { return x; });
}

} // namespace migraphx
...
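A small host-side sketch of the reworked helpers above (illustrative code, not the kernel header): where(cond, a, b) becomes the single branching primitive, and max/min are rewritten on top of it, which is what lets the MIGRAPHX_DEVICE_MATH_VEC wrapper lift them elementwise over vector arguments.

#include <cassert>

template <class T, class U>
constexpr auto where(bool cond, const T& a, const U& b)
{
    return cond ? a : b;
}

template <class T, class U>
constexpr auto max(const T& a, const U& b)
{
    return where(a < b, b, a);
}

template <class T, class U>
constexpr auto min(const T& a, const U& b)
{
    return where(a > b, b, a);
}

int main()
{
    static_assert(max(3, 7) == 7 and min(3, 7) == 3, "scalar behaviour is unchanged");
    static_assert(where(true, 1.5, 2.5) == 1.5, "where picks the first argument when true");
    return 0;
}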
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp (file mode changed from 100755 to 100644)

@@ -10,13 +10,38 @@
namespace migraphx {

template <class T>
struct implicit_conversion_op
{
    T x;

    template <index_int N, class U>
    constexpr operator vec<U, N>() const
    {
        static_assert(vec_size<T>() == N, "Vector mismatch size");
        return __builtin_convertvector(x, vec<U, N>);
    }

    template <class U>
    constexpr operator U() const
    {
        return x;
    }
};

template <class T>
constexpr implicit_conversion_op<T> implicit_conversion(T x)
{
    return {x};
}

template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    preload<typename T::type>(idx, xs...)([&](auto... ps) {
        idx.global_stride(out.get_shape().elements(), [&](auto i) {
            auto multi_idx = out.get_shape().multi(i);
-           out[multi_idx] = f(ps[multi_idx]...);
            out[multi_idx] = implicit_conversion(f(ps[multi_idx]...));
        });
    });
}
...

@@ -24,7 +49,7 @@ __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
-   auto t = transform_args(make_tensors(), rotate_last());
    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
    t(ps...)([&](auto... xs) {
        auto idx = make_index();
        pointwise_tensor(idx, f, xs...);
...
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp (file mode changed from 100755 to 100644)

@@ -29,7 +29,7 @@ constexpr auto traverse_preload(Shapes... ss)
}

template <class T, class... Shapes>
-constexpr index_int compute_preload_size(Shapes...)
constexpr index_int compute_preload_size_c(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
...

@@ -37,6 +37,12 @@ constexpr index_int compute_preload_size(Shapes...)
    return size;
}

template <class T, class... Shapes>
constexpr auto compute_preload_size(Shapes...)
{
    return _c<compute_preload_size_c<T>(Shapes{}...)>;
}

template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
...

@@ -48,11 +54,21 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
-               auto v = vectorize(x);
-               auto b = as_vec(tensor_vec_size(v), buffer + offset);
-               idx.local_stride(v.get_shape().element_space(),
-                                [&](auto i) { b[i] = v.data()[i]; });
-               return x.with(buffer + offset);
                if constexpr(decltype(tensor_vec_size(x)){} == 0)
                {
                    auto v = vectorize(x);
                    auto b = as_vec(tensor_vec_size(v), buffer + offset);
                    idx.local_stride(v.get_shape().element_space(),
                                     [&](auto i) { b[i] = v.data()[i]; });
                    return x.with(buffer + offset);
                }
                else
                {
                    auto b = as_vec(tensor_vec_size(x), buffer + offset);
                    idx.local_stride(x.get_shape().element_space(),
                                     [&](auto i) { b[i] = x.data()[i]; });
                    return x.with(b);
                }
            }
            else
            {
...

@@ -78,7 +94,7 @@ template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type = typename remove_vec<T>::type;
-   constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
    constexpr auto size      = decltype(compute_preload_size<type>(xs.get_shape()...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
...
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp (file mode changed from 100755 to 100644)

@@ -3,6 +3,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/functional.hpp>

namespace migraphx {
...

@@ -24,6 +25,38 @@ constexpr auto vec_size()
    return decltype(vec_size(T{})){};
}

template <class... Ts>
constexpr auto is_any_vec()
{
    if constexpr(sizeof...(Ts) == 0)
        return false_type{};
    else
        return bool_constant<((vec_size<Ts>() + ...) > 0)>{};
}

template <class T, class I>
constexpr auto vec_at(T x, I i)
{
    if constexpr(vec_size<T>() == 0)
        return x;
    else
    {
        MIGRAPHX_ASSERT(i < vec_size<T>());
        return x[i];
    }
}

template <class... Ts>
constexpr auto common_vec_size()
{
    return fold([](auto x, auto y) {
        if constexpr(x > y)
            return x;
        else
            return y;
    })(vec_size<Ts>()...);
}

template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
...

@@ -33,5 +66,25 @@ __device__ __host__ auto as_vec(T* x)
    return reinterpret_cast<vec<T, N>*>(x);
}

template <class... Ts>
constexpr auto vec_transform(Ts... xs)
{
    return [=](auto f) {
        if constexpr(is_any_vec<Ts...>())
        {
            using type             = decltype(f(vec_at(xs, 0)...));
            constexpr auto size    = common_vec_size<Ts...>();
            vec<type, size> result = {0};
            for(int i = 0; i < size; i++)
                result[i] = f(vec_at(xs, i)...);
            return result;
        }
        else
        {
            return f(xs...);
        }
    };
}

} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
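A standalone analogue of the new vec_transform (fixed arity, using a GCC-style vector extension in place of the kernel vec<T, N> alias; illustrative only): scalars are broadcast because vec_at ignores the lane index for them, while vector arguments are indexed lane by lane, so a scalar function is applied elementwise across the widest argument.

#include <cassert>

// 4 lanes of float.
typedef float float4 __attribute__((vector_size(16)));

// vec_at: scalars ignore the lane index, vectors are indexed.
float vec_at(float x, int) { return x; }
float vec_at(float4 x, int i) { return x[i]; }

template <class F>
float4 vec_transform(float4 x, float y, F f)
{
    float4 result = {0, 0, 0, 0};
    for(int i = 0; i < 4; i++)
        result[i] = f(vec_at(x, i), vec_at(y, i));
    return result;
}

int main()
{
    float4 x = {1, 2, 3, 4};
    // The scalar 10.0f is broadcast against every lane of x.
    float4 r = vec_transform(x, 10.0f, [](float a, float b) { return a * b; });
    assert(r[0] == 10 && r[1] == 20 && r[2] == 30 && r[3] == 40);
    return 0;
}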
src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp

@@ -7,40 +7,70 @@
namespace migraphx {

template <class T>
-constexpr auto tensor_vec_size(T)
constexpr auto tensor_vec_size()
{
    return vec_size<typename T::type>();
}

-template <index_int N, class Shape>
-constexpr auto as_vec_shape(Shape s)
-{
-    auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
-        if(stride == 1)
-            return len / N;
-        else
-            return len;
-    });
-    auto strides = transform(s.strides, [](auto stride) {
-        if(stride == 1)
-            return stride;
-        return stride / N;
-    });
-    MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
-    return make_shape(lens, strides);
-}

template <class T>
constexpr auto tensor_vec_size(T)
{
    return tensor_vec_size<T>();
}

template <index_int N, class Shape, class Axis>
constexpr auto shape_step(Shape s, Axis)
{
    static_assert(N > 0, "Vector size must be non-zero");
    return sequence(s.lens.size(), [&](auto... is) {
        auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            MIGRAPHX_ASSERT(i != 0);
            MIGRAPHX_ASSERT(j != axis or i % N == 0);
            if(j == axis)
                return i / N;
            else
                return i;
        });
        auto strides = transform(s.strides, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            // If stride of the axis is zero then we dont need to adjust the other strides
            if(Shape{}.strides[axis] == 0)
                return i;
            MIGRAPHX_ASSERT(j == axis or i % N == 0);
            if(j == axis)
                return i;
            else
                return i / N;
        });
        MIGRAPHX_ASSERT(make_shape(lens, strides).elements() * N == s.elements());
        MIGRAPHX_ASSERT(strides[Axis{}] == 0 or
                        make_shape(lens, strides).element_space() * N == s.element_space());
        return make_shape(lens, strides);
    });
}

// Bools can not be used as a vector type so convert it to int8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
    return x;
}

inline __device__ __host__ int8_t* remove_bool(bool* x) { return reinterpret_cast<int8_t*>(x); }

-template <index_int N, class T>
-__device__ __host__ auto as_vec(T x)
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
    if constexpr(N == 0)
        return x;
    else
-       return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
        return make_tensor_view(as_vec<N>(remove_bool(x.data())),
                                shape_step<N>(x.get_shape(), axis));
}

template <index_int N, class T, class Axis>
-constexpr auto tensor_step(T x, Axis)
constexpr auto tensor_step(T x, Axis axis)
{
    if constexpr(N == 0)
    {
...

@@ -49,17 +79,8 @@ constexpr auto tensor_step(T x, Axis)
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
-       MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
-       return sequence(x.get_shape().lens.size(), [&](auto... is) {
-           auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
-               constexpr auto axis = Axis{};
-               if(j == axis)
-                   return i / N;
-               else
-                   return i;
-           });
-           return make_tensor_view(x.data(), make_shape(lens, s.strides));
-       });
        MIGRAPHX_ASSERT(s.strides[axis] == 0);
        return make_tensor_view(x.data(), shape_step<N>(s, axis));
    }
}
...

@@ -69,45 +90,71 @@ __device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
    return as_vec<ic>(x);
}

-template <class... Shapes>
-constexpr index_int find_vector_axis(Shapes... ss)
template <class Shape>
constexpr index_int find_vector_axis_c(Shape s)
{
    // Find the fastest axis that is not broadcasted
    index_int axis = 0;
    for(index_int i = 1; i < s.lens.size(); i++)
    {
        if(s.strides[i] == 0)
            continue;
        if(s.strides[axis] == 0 or
           pack_compare(less{},
                        pack(s.strides[i], s.lens[i]),
                        pack(s.strides[axis], s.lens[axis])))
            axis = i;
    }
    return axis;
}

template <class... Shapes>
constexpr index_int find_vector_axis_c(Shapes... ss)
{
    const bool all_broadcasted = (ss.broadcasted() and ...);
    index_int axis = 0;
    bool b         = false;
    by([&](auto s) {
        if(b)
            return;
-       auto it = find(s.strides.begin(), s.strides.end(), 1);
-       if(it == s.strides.end())
-           return;
-       axis = it - s.strides.begin();
-       b    = true;
        // Skip broadcasted shapes if there are shapes not broadcasted
        if(not all_broadcasted and s.broadcasted())
            return;
        axis = find_vector_axis_c(s);
        if(s.strides[axis] == 1)
            b = true;
    })(ss...);
    if(not b)
        return -1;
    return axis;
}

template <class... Shapes>
constexpr auto find_vector_axis(Shapes...)
{
    return _c<find_vector_axis_c(Shapes{}...)>;
}

template <index_int N, class Axis, class... Shapes>
-constexpr auto is_vectorizable(Axis axis, Shapes... ss)
constexpr auto is_vectorizable_c(Axis axis, Shapes... ss)
{
-   return (((ss.lens[axis] % N) == 0 and ss.strides[axis] == 1) and ...);
    return ((axis < ss.lens.size() and ss.lens[axis] % N == 0 and
             // Only vectorize broadcasted types with stride 0, since this causes issues in the
             // preloader
             ((not ss.broadcasted() and ss.strides[axis] == 1) or ss.strides[axis] == 0)) and
            ...);
}

-template <index_int N, class Shape>
-constexpr bool is_vectorizable(Shape s)
-{
-    auto it = find(s.strides.begin(), s.strides.end(), 1);
-    if(it == s.strides.end())
-        return false;
-    auto axis = it - s.strides.begin();
-    return (s.lens[axis] % N) == 0 and s.strides[axis] == 1;
-}
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis, Shapes...)
{
    return _c<is_vectorizable_c<N>(Axis::to(), Shapes{}...)>;
}

template <class P>
constexpr auto find_vectorize_size(P pred)
{
-   if constexpr(pred(_c<4>))
    if constexpr(decltype(pred(_c<4>)){})
        return _c<4>;
-   else if constexpr(pred(_c<2>))
    else if constexpr(decltype(pred(_c<2>)){})
        return _c<2>;
    else
        return _c<0>;
...

@@ -116,11 +163,12 @@ constexpr auto find_vectorize_size(P pred)
template <class T>
__host__ __device__ auto vectorize(T x)
{
-   if constexpr(vec_size<T>() == 0)
    if constexpr(tensor_vec_size<T>() == 0)
    {
-       constexpr auto n = find_vectorize_size(
-           [&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
-       return as_vec<n>(x);
        constexpr auto axis = find_vector_axis(x.get_shape());
        constexpr auto n =
            find_vectorize_size([&](auto i) { return is_vectorizable<i>(axis, x.get_shape()); });
        return as_vec<n>(x, axis);
    }
    else
    {
...

@@ -128,34 +176,46 @@ __host__ __device__ auto vectorize(T x)
    }
}

template <class F, class... Ts>
inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
    // TODO: Just check there a single axis of 1
    constexpr bool packed_or_broadcasted =
        ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
    if constexpr(packed_or_broadcasted)
    {
        constexpr auto axis = decltype(find_vector_axis(xs.get_shape()...)){};
        constexpr auto n    = find_vectorize_size(
            [&](auto i) { return is_vectorizable<i>(axis, xs.get_shape()...); });
        by(
            [&](auto x) {
                constexpr auto s = decltype(x.get_shape()){};
                if constexpr(axis < s.strides.size())
                {
                    MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
                    MIGRAPHX_ASSERT(s.lens[axis] > 0);
                    MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
                    if constexpr(s.strides[axis] == 0)
                        return tensor_step<n>(x, axis);
                    else
                        return as_vec<n>(x, axis);
                }
                else
                {
                    return x;
                }
            },
            f)(xs...);
    }
    else
    {
        f(xs...);
    }
}

inline __device__ __host__ auto auto_vectorize()
{
-   return [](auto... xs) {
-       return [=](auto f) {
-           // TODO: Just check there a single axis of 1
-           constexpr bool packed_or_broadcasted =
-               ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
-           if constexpr(packed_or_broadcasted)
-           {
-               constexpr auto axis = find_vector_axis(xs.get_shape()...);
-               constexpr auto n    = find_vectorize_size(
-                   [&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
-               by(
-                   [&](auto x) {
-                       constexpr auto s = x.get_shape();
-                       if constexpr(s.strides[axis] == 0)
-                           return tensor_step<n>(x, axis);
-                       else
-                           return as_vec<n>(x);
-                   },
-                   f)(xs...);
-           }
-           else
-           {
-               f(xs...);
-           }
-       };
-   };
    return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
}

} // namespace migraphx
...
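A worked host-side sketch of the shape_step<N> arithmetic above, using plain std::vector in place of the constexpr kernel shape types (illustrative only): vectorizing by N on the fastest axis divides that axis's length by N and divides the other strides by N, so the element count shrinks by exactly N while the vectorized axis keeps stride 1; a broadcast axis (stride 0) leaves the other strides untouched.

#include <cassert>
#include <cstddef>
#include <vector>

struct simple_shape
{
    std::vector<std::size_t> lens;
    std::vector<std::size_t> strides;
};

simple_shape shape_step(const simple_shape& s, std::size_t axis, std::size_t n)
{
    simple_shape r = s;
    r.lens[axis] /= n;
    // When the vectorized axis is not broadcast (stride != 0), every other stride shrinks by n.
    if(s.strides[axis] != 0)
    {
        for(std::size_t j = 0; j < r.strides.size(); j++)
            if(j != axis)
                r.strides[j] /= n;
    }
    return r;
}

int main()
{
    // A packed {2, 3, 8} tensor with strides {24, 8, 1}, vectorized by 4 on the last axis.
    simple_shape s{{2, 3, 8}, {24, 8, 1}};
    auto v = shape_step(s, 2, 4);
    assert((v.lens == std::vector<std::size_t>{2, 3, 2}));
    assert((v.strides == std::vector<std::size_t>{6, 2, 1}));
    // 2*3*2 vector elements of width 4 cover the original 2*3*8 scalars.
    return 0;
}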
test/onnx/gen_onnx.py

@@ -1618,6 +1618,22 @@ def greater_bool_test():
    return ([node1, node2], [x1, x2], [y])


@onnx_test
def greaterorequal_test():
    x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [3])
    x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [3])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3])

    node = onnx.helper.make_node(
        'GreaterOrEqual',
        inputs=['x1', 'x2'],
        outputs=['y'],
    )

    return ([node], [x1, x2], [y])


@onnx_test
def group_conv_test():
    x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 4, 16, 16])
...

@@ -1634,6 +1650,60 @@ def group_conv_test():
    return ([node], [x, y], [z])


@onnx_test
def hardsigmoid_default_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_double_test():
    x = helper.make_tensor_value_info('x', TensorProto.DOUBLE, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.DOUBLE, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid',
                                 inputs=['x'],
                                 outputs=['y'],
                                 alpha=0.3,
                                 beta=0.7)

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_half_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_verify_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardswish_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 5])

    node = onnx.helper.make_node('HardSwish', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def if_else_test():
    x = onnx.helper.make_tensor_value_info('x', onnx.TensorProto.FLOAT, [2, 3])
...

@@ -2692,6 +2762,80 @@ def maxpool_same_upper_test():
    return ([node], [x], [y])


@onnx_test
def mean_broadcast_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 4])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 2, 3, 4])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT, [4])
    data_3 = helper.make_tensor_value_info('3', TensorProto.FLOAT, [1])
    data_4 = helper.make_tensor_value_info('4', TensorProto.FLOAT, [2, 3, 1])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3, 4])

    node = onnx.helper.make_node("Mean",
                                 inputs=["0", "1", "2", "3", "4"],
                                 outputs=["mean"])

    return ([node], [data_0, data_1, data_2, data_3, data_4], [mean])


@onnx_test
def mean_fp16_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [1, 2, 3])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [1, 2, 3])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [1, 2, 3])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT16, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0", "1", "2"], outputs=["mean"])

    return ([node], [data_0, data_1, data_2], [mean])


@onnx_test
def mean_invalid_broadcast_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 2, 3])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT, [1, 2, 4])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0", "1", "2"], outputs=["mean"])

    return ([node], [data_0, data_1, data_2], [mean])


@onnx_test
def mean_single_input_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0"], outputs=["mean"])

    return ([node], [data_0], [mean])


@onnx_test
def mean_test():
    data = [
        helper.make_tensor_value_info(str(i), TensorProto.DOUBLE, [2, 2, 2])
        for i in range(10)
    ]
    data_names = [str(i) for i in range(10)]
    mean = helper.make_tensor_value_info('mean', TensorProto.DOUBLE, [2, 2, 2])

    node = onnx.helper.make_node("Mean", inputs=data_names, outputs=["mean"])

    return ([node], data, [mean])


@onnx_test
def min_test():
    a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3])
...

@@ -2725,6 +2869,21 @@ def multinomial_test():
    return ([node], [input], [output])


@onnx_test
def multinomial_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('Multinomial',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def multinomial_dtype_error_test():
    sample_size = 10
...

@@ -3176,6 +3335,21 @@ def randomnormal_dtype_error_test():
    return ([node], [], [output])


@onnx_test
def randomnormal_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('RandomNormal',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def randomnormal_shape_error_test():
    dtype = 1
...

@@ -3266,6 +3440,21 @@ def randomuniform_dtype_error_test():
    return ([node], [], [output])


@onnx_test
def randomuniform_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('RandomUniform',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def randomuniform_shape_error_test():
    dtype = 1
...

@@ -4290,6 +4479,44 @@ def softmax_nonstd_input_test():
    return ([node0, node1], [x], [y])


@onnx_test
def softsign_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [5])

    node = onnx.helper.make_node('Softsign', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


def softplus_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [5])

    node = onnx.helper.make_node('Softplus', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def softsign_nd_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [3, 4, 5])

    node = onnx.helper.make_node('Softsign', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


def softplus_nd_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [3, 4, 5])

    node = onnx.helper.make_node('Softplus', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def split_minus_axis_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])
...

@@ -4847,6 +5074,25 @@ def unknown_aten_test():
    return ([node], [x, y], [a])


@onnx_test
def upsample_linear_test():
    scales = np.array([1.0, 1.0, 2.0, 2.0], dtype=np.float32)
    scales_tensor = helper.make_tensor(name='scales',
                                       data_type=TensorProto.FLOAT,
                                       dims=scales.shape,
                                       vals=scales.flatten().astype(np.float32))
    X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1, 2, 2])
    Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [])

    node = onnx.helper.make_node('Upsample',
                                 inputs=['X', '', 'scales'],
                                 outputs=['Y'],
                                 mode='linear')

    return ([node], [X], [Y], [scales_tensor])


@onnx_test
def upsample_test():
    scales = np.array([1.0, 1.0, 2.0, 3.0], dtype=np.float32)
...
test/onnx/gen_onnx.pyc

(binary file; no preview for this file type)
test/onnx/greaterorequal_test.onnx (new file, mode 100644)

(binary ONNX protobuf; the text preview only shows fragments of the graph "greaterorequal_test" with inputs x1, x2, a GreaterOrEqual node, and output y)
test/onnx/hardsigmoid_default_test.onnx (new file, mode 100644)

(binary ONNX protobuf; the text preview only shows fragments of the graph "hardsigmoid_default_test" with input x, a HardSigmoid node, and output y)