Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
a6bde7c0
Unverified
Commit
a6bde7c0
authored
Nov 10, 2023
by
Manupa Karunaratne
Committed by
GitHub
Nov 10, 2023
Browse files
Merge branch 'develop' into mlir-attention
parents
fe36d210
35e5298e
Changes
101
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1218 additions
and
202 deletions
+1218
-202
src/simplify_dyn_ops.cpp
src/simplify_dyn_ops.cpp
+46
-2
src/simplify_reshapes.cpp
src/simplify_reshapes.cpp
+2
-2
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+16
-5
src/targets/gpu/compile_hip.cpp
src/targets/gpu/compile_hip.cpp
+11
-8
src/targets/gpu/compile_ops.cpp
src/targets/gpu/compile_ops.cpp
+10
-7
src/targets/gpu/gemm_impl.cpp
src/targets/gpu/gemm_impl.cpp
+474
-145
src/targets/gpu/include/migraphx/gpu/gemm.hpp
src/targets/gpu/include/migraphx/gpu/gemm.hpp
+37
-6
src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
+51
-13
src/targets/gpu/include/migraphx/gpu/rocblas.hpp
src/targets/gpu/include/migraphx/gpu/rocblas.hpp
+1
-1
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+6
-0
src/version.h.in
src/version.h.in
+1
-1
test/api/CMakeLists.txt
test/api/CMakeLists.txt
+6
-4
test/api/test_cpu.cpp
test/api/test_cpu.cpp
+25
-0
test/api/test_gpu.cpp
test/api/test_gpu.cpp
+55
-0
test/gpu/codegen_literal.cpp
test/gpu/codegen_literal.cpp
+1
-1
test/gpu/gemm_tune.cpp
test/gpu/gemm_tune.cpp
+225
-0
test/onnx/.onnxrt-commit
test/onnx/.onnxrt-commit
+1
-1
test/onnx/gen_onnx.py
test/onnx/gen_onnx.py
+250
-6
test/onnx/isinf_double_pos_test.onnx
test/onnx/isinf_double_pos_test.onnx
+0
-0
test/onnx/isinf_half_neg_test.onnx
test/onnx/isinf_half_neg_test.onnx
+0
-0
No files found.
src/simplify_dyn_ops.cpp
View file @
a6bde7c0
...
...
@@ -24,6 +24,7 @@
#include <migraphx/simplify_dyn_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/literal.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -131,10 +132,53 @@ struct find_const_4in_slice
}
};
/**
* Simplify dimensions_of to a literal when the input arugment has a static shape
* or the dynamic dimensions from `start` to `end` are fixed.
*/
struct
find_static_dimensions_of
{
auto
matcher
()
const
{
return
match
::
name
(
"dimensions_of"
)();
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
input
=
ins
->
inputs
().
at
(
0
);
auto
dimensions_of_value
=
ins
->
get_operator
().
to_value
();
auto
start
=
dimensions_of_value
.
at
(
"start"
).
to
<
std
::
size_t
>
();
auto
end
=
dimensions_of_value
.
at
(
"end"
).
to
<
std
::
size_t
>
();
if
(
input
->
get_shape
().
dynamic
())
{
// check if dynamic dimensions from start to end are fixed
auto
dds
=
input
->
get_shape
().
dyn_dims
();
if
(
std
::
any_of
(
dds
.
begin
()
+
start
,
dds
.
begin
()
+
end
,
[](
auto
dd
)
{
return
not
dd
.
is_fixed
();
}))
{
return
;
}
}
std
::
size_t
output_ndim
=
end
-
start
;
std
::
vector
<
int64_t
>
vec_shape
(
output_ndim
);
migraphx
::
shape
s
(
migraphx
::
shape
::
int64_type
,
{
output_ndim
});
std
::
vector
<
std
::
size_t
>
input_lens
=
input
->
get_shape
().
to_static
(
1
).
lens
();
std
::
transform
(
input_lens
.
begin
()
+
start
,
input_lens
.
begin
()
+
end
,
vec_shape
.
begin
(),
[](
auto
i
)
{
return
int64_t
(
i
);
});
migraphx
::
shape
output_shape
{
migraphx
::
shape
::
int64_type
,
{
end
-
start
}};
auto
lit_ins
=
m
.
add_literal
(
migraphx
::
literal
{
output_shape
,
vec_shape
});
m
.
replace_instruction
(
ins
,
lit_ins
);
}
};
void
simplify_dyn_ops
::
apply
(
module
&
m
)
const
{
match
::
find_matches
(
m
,
find_static_2in_broadcasts
{},
find_const_3in_slice
{},
find_const_4in_slice
{});
match
::
find_matches
(
m
,
find_static_2in_broadcasts
{},
find_static_dimensions_of
{},
find_const_3in_slice
{},
find_const_4in_slice
{});
}
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/simplify_reshapes.cpp
View file @
a6bde7c0
...
...
@@ -647,8 +647,8 @@ struct find_broadcast_transpose
{
auto
transpose
=
r
.
result
;
auto
transpose_lens
=
transpose
->
get_shape
().
lens
();
auto
bcast_ins
=
r
.
instructions
[
"bcast_ins"
];
auto
input
=
bcast_ins
->
inputs
().
front
();
auto
bcast_ins
=
r
.
instructions
[
"bcast_ins"
];
auto
input
=
bcast_ins
->
inputs
().
front
();
// scalar transformation does not need extra transpose
if
(
not
input
->
get_shape
().
scalar
())
{
...
...
src/targets/gpu/CMakeLists.txt
View file @
a6bde7c0
# ####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
...
...
@@ -231,24 +231,28 @@ else()
string
(
REGEX REPLACE
" /[^ ]+
\\
.(a|so) "
" "
HIP_COMPILER_FLAGS
"
${
HIP_COMPILER_FLAGS
}
"
)
endforeach
()
message
(
STATUS
"Hip compiler flags:
${
HIP_COMPILER_FLAGS
}
"
)
message
(
STATUS
"Hip compiler flags:
\"
${
HIP_COMPILER_FLAGS
}
\"
"
)
target_compile_definitions
(
migraphx_gpu PRIVATE
"
-DMIGRAPHX_HIP_COMPILER=
${
CMAKE_CXX_COMPILER
}
"
"
-DMIGRAPHX_HIP_COMPILER_FLAGS=
${
HIP_COMPILER_FLAGS
}
"
-DMIGRAPHX_HIP_COMPILER=
"
${
CMAKE_CXX_COMPILER
}
"
-DMIGRAPHX_HIP_COMPILER_FLAGS=
"
${
HIP_COMPILER_FLAGS
}
"
)
if
(
DEFINED CMAKE_CXX_COMPILER_LAUNCHER
)
execute_process
(
COMMAND which
${
CMAKE_CXX_COMPILER_LAUNCHER
}
OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER
)
string
(
STRIP
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
target_compile_definitions
(
migraphx_gpu PRIVATE
"
-DMIGRAPHX_HIP_COMPILER_LAUNCHER=
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
)
target_compile_definitions
(
migraphx_gpu PRIVATE -DMIGRAPHX_HIP_COMPILER_LAUNCHER=
"
${
MIGRAPHX_HIP_COMPILER_LAUNCHER
}
"
)
endif
()
endif
()
# Check miopen find mode api
include
(
CheckLibraryExists
)
get_target_property
(
MIOPEN_LOCATION MIOpen LOCATION
)
get_target_property
(
ROCBLAS_LOCATION roc::rocblas LOCATION
)
check_library_exists
(
MIOpen
"miopenHiddenSetConvolutionFindMode"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_MODE_API
)
check_library_exists
(
MIOpen
"miopenFindSolutions"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_2_API
)
# Beta API for automated GEMM tuning
check_library_exists
(
roc::rocblas
"rocblas_gemm_ex_get_solutions"
"
${
ROCBLAS_LOCATION
}
"
HAS_ROCBLAS_TUNING_BETA_FEATURE_API
)
set
(
MIGRAPHX_USE_FIND_2_API
"
${
HAS_FIND_2_API
}
"
CACHE BOOL
""
)
...
...
@@ -271,6 +275,13 @@ else()
message
(
STATUS
"MIOpen does not have find mode api"
)
endif
()
if
(
HAS_ROCBLAS_TUNING_BETA_FEATURE_API
)
target_compile_definitions
(
migraphx_gpu PUBLIC -DMIGRAPHX_USE_ROCBLAS_TUNING_API -DROCBLAS_BETA_FEATURES_API -DROCBLAS_NO_DEPRECATED_WARNINGS
)
message
(
STATUS
"MIGraphx is using Beta API of rocBLAS"
)
else
()
message
(
STATUS
"rocBLAS does not have User Tuning Beta API"
)
endif
()
target_link_libraries
(
migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas
)
target_link_libraries
(
migraphx_gpu PRIVATE migraphx_device migraphx_kernels
)
if
(
MIGRAPHX_USE_COMPOSABLEKERNEL
)
...
...
src/targets/gpu/compile_hip.cpp
View file @
a6bde7c0
...
...
@@ -284,16 +284,20 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
bool
is_hip_clang_compiler
()
{
static
const
auto
result
=
ends_with
(
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
),
"clang++"
)
;
static
const
auto
result
=
fs
::
path
{
MIGRAPHX_HIP_COMPILER
}.
stem
()
==
"clang++"
;
return
result
;
}
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
bool
has_compiler_launcher
()
{
static
const
auto
result
=
fs
::
exists
(
MIGRAPHX_
STRINGIZE
(
MIGRAPHX_
HIP_COMPILER_LAUNCHER
)
)
;
static
const
auto
result
=
fs
::
exists
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
);
return
result
;
}
#endif
src_compiler
assemble
(
src_compiler
compiler
)
{
compiler
.
out_ext
=
".S"
;
...
...
@@ -306,8 +310,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
{
assert
(
not
srcs
.
empty
());
if
(
not
is_hip_clang_compiler
())
MIGRAPHX_THROW
(
"Unknown hip compiler: "
+
std
::
string
(
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)));
MIGRAPHX_THROW
(
"Unknown hip compiler: "
MIGRAPHX_HIP_COMPILER
);
if
(
params
.
find
(
"-std="
)
==
std
::
string
::
npos
)
params
+=
" --std=c++17"
;
...
...
@@ -323,14 +326,14 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
params
+=
" -DMIGRAPHX_DEBUG"
;
params
+=
" -Wno-unused-command-line-argument -Wno-cuda-compat "
;
params
+=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_FLAGS
)
;
params
+=
MIGRAPHX_HIP_COMPILER_FLAGS
;
src_compiler
compiler
;
compiler
.
flags
=
params
;
compiler
.
compiler
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)
;
compiler
.
compiler
=
MIGRAPHX_HIP_COMPILER
;
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if
(
has_compiler_launcher
())
compiler
.
launcher
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER_LAUNCHER
)
;
compiler
.
launcher
=
MIGRAPHX_HIP_COMPILER_LAUNCHER
;
#endif
if
(
enabled
(
MIGRAPHX_GPU_DUMP_SRC
{}))
{
...
...
@@ -354,7 +357,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
bool
hip_has_flags
(
const
std
::
vector
<
std
::
string
>&
flags
)
{
src_compiler
compiler
;
compiler
.
compiler
=
MIGRAPHX_STRINGIZE
(
MIGRAPHX_HIP_COMPILER
)
;
compiler
.
compiler
=
MIGRAPHX_HIP_COMPILER
;
compiler
.
flags
=
join_strings
(
flags
,
" "
)
+
" -x hip -c --offload-arch=gfx900 --cuda-device-only"
;
...
...
src/targets/gpu/compile_ops.cpp
View file @
a6bde7c0
...
...
@@ -168,6 +168,7 @@ struct compile_plan
}
const
compiled_result
&
benchmark
(
problem_cache
&
pc
)
const
{
const
auto
trace_level
=
value_of
(
MIGRAPHX_TRACE_BENCHMARKING
{});
if
(
results
.
empty
())
MIGRAPHX_THROW
(
"No configs to tune"
);
if
(
results
.
size
()
==
1
)
...
...
@@ -178,9 +179,10 @@ struct compile_plan
}
if
(
not
config
)
MIGRAPHX_THROW
(
"Multiple kernels without config"
);
std
::
cout
<<
"Benchmarking "
<<
preop
.
name
()
<<
": "
<<
results
.
size
()
<<
" configs"
<<
std
::
endl
;
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{}))
if
(
trace_level
>
0
)
std
::
cout
<<
"Benchmarking "
<<
preop
.
name
()
<<
": "
<<
results
.
size
()
<<
" configs"
<<
std
::
endl
;
if
(
trace_level
>
1
)
std
::
cout
<<
"Problem: "
<<
config
->
problem
<<
std
::
endl
;
std
::
vector
<
double
>
times
;
times
.
reserve
(
results
.
size
());
...
...
@@ -189,22 +191,23 @@ struct compile_plan
config
->
solutions
.
begin
(),
std
::
back_inserter
(
times
),
[
&
](
const
auto
&
cr
,
const
auto
&
solution
)
{
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
"Benchmarking solution: "
<<
solution
<<
std
::
endl
;
if
(
not
cr
.
has_value
())
{
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
"No binary"
<<
std
::
endl
;
return
std
::
numeric_limits
<
double
>::
max
();
}
auto
t
=
time_op
(
*
ctx
,
cr
->
replace
.
code_object
,
to_shapes
(
cr
->
ins
->
inputs
()),
20
);
if
(
enabled
(
MIGRAPHX_TRACE_BENCHMARKING
{})
)
if
(
trace_level
>
1
)
std
::
cout
<<
t
<<
"ms"
<<
std
::
endl
;
return
t
;
});
auto
i
=
std
::
distance
(
times
.
begin
(),
std
::
min_element
(
times
.
begin
(),
times
.
end
()));
std
::
cout
<<
"Fastest solution: "
<<
config
->
solutions
.
at
(
i
)
<<
std
::
endl
;
if
(
trace_level
>
0
)
std
::
cout
<<
"Fastest solution: "
<<
config
->
solutions
.
at
(
i
)
<<
std
::
endl
;
pc
.
insert
(
preop
.
name
(),
config
->
problem
,
config
->
solutions
.
at
(
i
));
if
(
not
results
[
i
].
has_value
())
MIGRAPHX_THROW
(
"No valid tuned compilation."
);
...
...
src/targets/gpu/gemm_impl.cpp
View file @
a6bde7c0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
...
...
@@ -21,15 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <rocblas/rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/time.hpp>
using
microseconds
=
std
::
chrono
::
duration
<
double
,
std
::
micro
>
;
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
// Convert rocBLAS datatypes to equivalent Migraphx data types
rocblas_datatype
get_type
(
shape
::
type_t
type
)
{
switch
(
type
)
...
...
@@ -81,184 +86,508 @@ shape transpose_batch(const shape& s, unsigned trans_batch)
return
shape
::
from_permutation
(
s
.
type
(),
s
.
lens
(),
perm
);
}
template
<
class
R
,
class
...
Ts
,
class
...
Us
>
R
rocblas_invoke
(
R
(
*
f
)(
Ts
...),
Us
...
xs
)
/**
* Returns results of rocblas_status_success, rocblas_status_perf_degraded,
* or rocblas_status_invalid_value. Caller
* is expected to check for invalid index. Any other result causes an exception.
*
*/
template
<
class
F
,
class
Pack
,
class
...
Ts
>
auto
rocblas_invoke
(
F
f
,
Pack
p
,
Ts
...
xs
)
{
if
constexpr
(
sizeof
...(
Ts
)
==
sizeof
...(
Us
))
return
f
(
xs
...);
else
return
f
(
xs
...,
nullptr
,
nullptr
);
return
p
([
=
](
auto
...
ws
)
{
auto
status
=
f
(
ws
...,
xs
...);
if
(
status
!=
rocblas_status_success
and
status
!=
rocblas_status_invalid_value
)
{
if
(
status
==
rocblas_status_perf_degraded
)
{
std
::
cerr
<<
"WARNING: degraded perf. in rocBLAS call"
<<
std
::
endl
;
}
else
MIGRAPHX_THROW
(
"rocblas_invoke: rocBLAS call failed with status "
+
std
::
to_string
(
status
));
}
return
status
;
});
}
static
bool
is_transposed
(
const
shape
&
s
)
{
if
(
not
s
.
transposed
())
return
false
;
return
s
.
strides
().
back
()
!=
1
;
}
static
bool
is_transposed
(
const
shape
&
s
)
{
return
s
.
transposed
()
and
s
.
strides
().
back
()
!=
1
;
}
static
rocblas_int
get_batch_stride
(
const
argument
&
a
)
static
rocblas_int
get_batch_stride
(
const
shape
&
s
)
{
return
a
.
get_shape
().
strides
()[
a
.
get_shape
().
strides
().
size
()
-
3
];
// This value is not needed for non-strided inputs
if
(
s
.
strides
().
size
()
<
3
)
return
0
;
else
return
s
.
strides
()[
s
.
strides
().
size
()
-
3
];
}
template
<
class
T
>
void
gemm_impl
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
T
alpha
,
T
beta
,
bool
compute_fp32
)
/**
* Wrapper for multiple rocBLAS calls. The constructor creates parameters for
* these calls based on data shapes and other values contained in the associated
* instruction and operation.
*
* The template parameter T is not the type of the matrix data but of the weighting
* coefficients alpha and beta (these are float in rocBLAS internals)
*/
template
<
typename
T
>
struct
gemm_impl
{
const
bool
is_3inputs
=
(
args
.
size
()
==
4
);
if
(
not
is_3inputs
)
{
beta
=
0
;
}
bool
transa
=
is_transposed
(
args
[
0
].
get_shape
());
bool
transb
=
is_transposed
(
args
[
1
].
get_shape
());
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_1
=
n_dim
-
1
;
auto
dim_0
=
n_dim
-
2
;
rocblas_int
lda
=
args
[
0
].
get_shape
().
strides
()[
transa
?
dim_1
:
dim_0
];
rocblas_int
ldb
=
args
[
1
].
get_shape
().
strides
()[
transb
?
dim_1
:
dim_0
];
rocblas_int
ldc
=
args
[
2
].
get_shape
().
strides
()[
dim_0
];
rocblas_int
ldd
=
is_3inputs
?
args
[
3
].
get_shape
().
strides
()[
dim_0
]
:
ldc
;
rocblas_datatype
arg_type
=
get_type
(
args
[
0
].
get_shape
().
type
());
auto
output_type
=
arg_type
;
if
(
output_type
==
rocblas_datatype_i8_r
)
{
output_type
=
rocblas_datatype_i32_r
;
}
auto
compute_type
=
output_type
;
if
(
compute_fp32
)
gemm_impl
(
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
,
T
alpha_param
,
T
beta_param
,
bool
compute_fp32_flag
)
:
alpha
(
alpha_param
),
beta
(
beta_param
),
is_3inputs
(
input_shapes
.
size
()
==
4
),
compute_fp32
(
compute_fp32_flag
)
{
if
(
arg_type
==
rocblas_datatype_f16_r
)
compute_type
=
rocblas_datatype_f32_r
;
}
if
(
not
is_3inputs
)
{
beta
=
0
;
}
rocblas_gemm_flags
flag
=
rocblas_gemm_flags_none
;
auto
a_lens
=
args
[
0
].
get_shape
().
lens
();
auto
b_lens
=
args
[
1
].
get_shape
().
lens
();
output_shape
.
visit_type
([
&
](
auto
as
)
{
auto
alpha_r
=
as
(
alpha
);
auto
beta_r
=
as
(
beta
);
// Create lambdas that will cast alpha, beta to the output shape's type
// and retain the values being pointed to
output_shape
.
visit_type
([
&
](
auto
as
)
{
auto
alpha_r
=
as
(
alpha
);
auto
beta_r
=
as
(
beta
);
if
(
compute_fp32
)
{
get_alpha
=
[
=
]
{
return
&
alpha
;
};
get_beta
=
[
=
]
{
return
&
beta
;
};
}
else
{
get_alpha
=
[
=
]
{
return
&
alpha_r
;
};
get_beta
=
[
=
]
{
return
&
beta_r
;
};
}
});
// use void pointer to select different data type if using fp32 mode
void
*
alpha_v
=
&
alpha_r
;
void
*
beta_v
=
&
beta_r
;
transa
=
is_transposed
(
input_shapes
[
0
]);
transb
=
is_transposed
(
input_shapes
[
1
]);
auto
n_dim
=
output_shape
.
lens
().
size
();
auto
dim_0
=
n_dim
-
2
;
auto
dim_1
=
n_dim
-
1
;
// Leading dimensions of matrices
lda
=
input_shapes
[
0
].
strides
()[
transa
?
dim_1
:
dim_0
];
ldb
=
input_shapes
[
1
].
strides
()[
transb
?
dim_1
:
dim_0
];
ldc
=
input_shapes
[
2
].
strides
()[
dim_0
];
ldd
=
is_3inputs
?
input_shapes
[
3
].
strides
()[
dim_0
]
:
ldc
;
arg_type
=
get_type
(
input_shapes
[
0
].
type
());
output_type
=
arg_type
;
if
(
output_type
==
rocblas_datatype_i8_r
)
{
output_type
=
rocblas_datatype_i32_r
;
}
compute_type
=
output_type
;
if
(
compute_fp32
)
{
alpha_v
=
&
alpha
;
beta_v
=
&
beta
;
if
(
arg_type
==
rocblas_datatype_f16_r
)
compute_type
=
rocblas_datatype_f32_r
;
}
auto
out_lens
=
output_shape
.
lens
();
rocblas_int
m
=
out_lens
[
dim_0
];
rocblas_int
n
=
out_lens
[
dim_1
];
rocblas_int
k
=
args
[
0
].
get_shape
().
lens
()[
dim_1
];
auto
to_pointer
=
[
&
](
auto
&&
arg
)
{
return
as
.
from
(
arg
.
data
());
};
auto
a_lens
=
input_shapes
[
0
].
lens
();
auto
b_lens
=
input_shapes
[
1
].
lens
();
auto
num_matrices
=
std
::
accumulate
(
auto
out_lens
=
output_shape
.
lens
();
m
=
out_lens
[
dim_0
];
n
=
out_lens
[
dim_1
];
k
=
input_shapes
[
0
].
lens
()[
dim_1
];
a_stride
=
get_batch_stride
(
input_shapes
[
0
]);
b_stride
=
get_batch_stride
(
input_shapes
[
1
]);
c_stride
=
get_batch_stride
(
input_shapes
[
2
]);
d_stride
=
is_3inputs
?
get_batch_stride
(
input_shapes
[
3
])
:
c_stride
;
num_matrices
=
std
::
accumulate
(
out_lens
.
rbegin
()
+
2
,
out_lens
.
rend
(),
std
::
size_t
{
1
},
std
::
multiplies
<
std
::
size_t
>
());
if
(
num_matrices
==
1
or
(
num_matrices
>
1
and
get_batch_stride
(
args
[
1
])
==
0
))
strided_batched
=
num_matrices
>
1
;
if
(
strided_batched
and
b_stride
==
0
and
input_shapes
[
0
].
standard
())
{
// If the batch dimension of B is broadcasted, then we can
// multiply m by the batch_size and use rocblas_gemm_ex
// instead of rocblas_gemm_strided_batched_ex.
m
*=
num_matrices
;
strided_batched
=
false
;
}
}
// the rocblas_gemm API handles inputs and output matrices as
// column-major format. When doing a C = A * B, we actually do
// C^T = (B^T) * (A^T). That is the reason we input args[1] as
// A and args[0] as B in calling the rocblas_gemm.
void
run
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
input_args
,
int32_t
solution_idx
=
0
)
const
{
if
(
strided_batched
)
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
gemm_flags
);
}
else
{
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex
,
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
alpha_v
,
to_pointer
(
args
.
at
(
1
)),
arg_type
,
ldb
,
to_pointer
(
args
.
at
(
0
)),
arg_type
,
lda
,
beta_v
,
to_pointer
(
args
[
2
]),
output_type
,
ldc
,
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
]),
output_type
,
ldd
,
compute_type
,
rocblas_gemm_algo_standard
,
0
,
flag
);
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
gemm_flags
);
}
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
auto
validate
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
input_shapes
,
int32_t
solution_idx
)
const
{
// Create dummy arguments for the shapes, and call the overloaded method
std
::
vector
<
argument
>
input_args
;
std
::
transform
(
input_shapes
.
begin
(),
input_shapes
.
end
(),
std
::
back_inserter
(
input_args
),
[](
const
shape
&
x
)
{
return
to_gpu
(
generate_argument
(
x
));
});
return
validate
(
ctx
,
input_args
,
solution_idx
);
}
/**
* Checks a particular solution for validity by running it with the flag
* rocblas_gemm_flags_check_solution_index (could be invalid if this model was
* tuned with a different rocBLAS version)
*
* @return Returns either solution_idx if valid, or else the default value 0
* if not. The default does not mean list index 0, but tells the picker
* to choose a solution.
*/
int32_t
validate
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
input_args
,
int32_t
solution_idx
)
const
{
rocblas_status_
check_valid
(
rocblas_status_success
);
if
(
strided_batched
)
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
check_valid
=
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
rocblas_gemm_flags_check_solution_index
);
}
else
{
auto
a_stride
=
get_batch_stride
(
args
[
0
]);
auto
b_stride
=
get_batch_stride
(
args
[
1
]);
auto
c_stride
=
get_batch_stride
(
args
[
2
]);
auto
d_stride
=
is_3inputs
?
get_batch_stride
(
args
[
3
])
:
c_stride
;
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex
,
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
alpha_v
,
to_pointer
(
args
.
at
(
1
)),
arg_type
,
ldb
,
b_stride
,
to_pointer
(
args
.
at
(
0
)),
arg_type
,
lda
,
a_stride
,
beta_v
,
to_pointer
(
args
[
2
]),
output_type
,
ldc
,
c_stride
,
is_3inputs
?
to_pointer
(
args
[
3
])
:
to_pointer
(
args
[
2
]),
output_type
,
ldd
,
d_stride
,
num_matrices
,
compute_type
,
rocblas_gemm_algo_standard
,
0
,
flag
);
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
check_valid
=
rocblas_invoke
(
&
rocblas_gemm_ex
,
common_args
,
rocblas_gemm_algo_solution_index
,
solution_idx
,
rocblas_gemm_flags_check_solution_index
);
}
});
if
(
check_valid
==
rocblas_status_invalid_value
)
{
std
::
cerr
<<
"WARNING: tuned solution is invalid; reverting to default"
<<
std
::
endl
;
return
0
;
}
return
solution_idx
;
}
#endif
/**
* Helper method to create that subset of a long rocBLAS argument list that is common
* to multiple "...strided_batched..." calls.
*
* The rocblas_gemm API handles inputs and output matrices as
* column-major format. When doing a C = A * B, we actually do
* C^T = (B^T) * (A^T). That is the reason we input args[1] as
* A and args[0] as B in calling the rocblas_gemm.
*
*/
auto
create_strided_batched_args_common
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
args
)
const
{
return
pack
(
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
get_alpha
(),
args
[
1
].
data
(),
arg_type
,
ldb
,
b_stride
,
args
[
0
].
data
(),
arg_type
,
lda
,
a_stride
,
get_beta
(),
args
[
2
].
data
(),
output_type
,
ldc
,
c_stride
,
is_3inputs
?
args
[
3
].
data
()
:
args
[
2
].
data
(),
output_type
,
ldd
,
d_stride
,
num_matrices
,
compute_type
);
}
/**
* Helper method to create that subset of a long rocBLAS argument list that is common
* to multiple "gemm_ex..." calls.
*
* The rocblas_gemm API handles inputs and output matrices as
* column-major format. When doing a C = A * B, we actually do
* C^T = (B^T) * (A^T). That is the reason we input args[1] as
* A and args[0] as B in calling the rocblas_gemm.
*
* */
auto
create_gemm_ex_args_common
(
context
&
ctx
,
const
std
::
vector
<
argument
>&
args
)
const
{
return
pack
(
ctx
.
get_stream
().
get_rocblas
(),
transb
?
rocblas_operation_transpose
:
rocblas_operation_none
,
transa
?
rocblas_operation_transpose
:
rocblas_operation_none
,
n
,
m
,
k
,
get_alpha
(),
args
[
1
].
data
(),
arg_type
,
ldb
,
args
[
0
].
data
(),
arg_type
,
lda
,
get_beta
(),
args
[
2
].
data
(),
output_type
,
ldc
,
is_3inputs
?
args
[
3
].
data
()
:
args
[
2
].
data
(),
output_type
,
ldd
,
compute_type
);
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
/**
* Find best rocBLAS solution: Get list of solutions and try them all, returning the index
* of the fastest one.
*/
int
tune
(
context
&
ctx
,
const
std
::
vector
<
shape
>&
input_shapes
)
const
{
// tuning meta parameters
const
int
hot_calls
=
40
;
std
::
vector
<
argument
>
input_args
;
std
::
transform
(
input_shapes
.
begin
(),
input_shapes
.
end
(),
std
::
back_inserter
(
input_args
),
[](
const
shape
&
x
)
{
return
to_gpu
(
generate_argument
(
x
));
});
// Get the solutions list in 2 rocBLAS steps:
// 1. Find out how many solutions there are and allocate the array
// 2. Get the solutions
//
rocblas_int
list_size
=
0
;
std
::
vector
<
rocblas_int
>
solution_indices
;
if
(
strided_batched
)
{
auto
common_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex_get_solutions
,
common_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
nullptr
,
&
list_size
);
solution_indices
.
resize
(
list_size
);
auto
common_sol_args
=
create_strided_batched_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_strided_batched_ex_get_solutions
,
common_sol_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
solution_indices
.
data
(),
&
list_size
);
}
else
{
auto
common_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex_get_solutions
,
common_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
nullptr
,
&
list_size
);
solution_indices
.
resize
(
list_size
);
auto
common_sol_args
=
create_gemm_ex_args_common
(
ctx
,
input_args
);
rocblas_invoke
(
&
rocblas_gemm_ex_get_solutions
,
common_sol_args
,
rocblas_gemm_algo_solution_index
,
gemm_flags
,
solution_indices
.
data
(),
&
list_size
);
}
double
best_time
=
std
::
numeric_limits
<
double
>::
max
();
double
first_time
=
-
1
;
// Initialize to default solution index
rocblas_int
best_sol
=
0
;
for
(
auto
sol
:
solution_indices
)
{
// Warmup: the first call to an op. may not be representative since there is
// more time taken initializing caches, etc. so we won't time it.
run
(
ctx
,
input_args
,
sol
);
double
host_time
=
time
<
milliseconds
>
([
&
]
{
for
([[
maybe_unused
]]
int
hc
:
range
(
hot_calls
))
run
(
ctx
,
input_args
,
sol
);
ctx
.
finish
();
});
host_time
/=
hot_calls
;
// dev/evaluation only: track time for first solution.
if
(
first_time
<
0
)
first_time
=
host_time
;
// track current best
if
(
host_time
<
best_time
)
{
best_sol
=
sol
;
best_time
=
host_time
;
}
}
std
::
cout
<<
"Winning GEMM solution: "
<<
best_sol
<<
" in "
<<
best_time
<<
" ms, beats "
<<
first_time
<<
"ms"
<<
std
::
endl
;
return
best_sol
;
}
#endif
private:
size_t
num_matrices
=
0
;
rocblas_int
m
=
0
;
rocblas_int
n
=
0
;
rocblas_int
k
=
0
;
bool
transa
=
false
;
bool
transb
=
false
;
T
alpha
=
0
;
T
beta
=
0
;
std
::
function
<
const
void
*
()
>
get_alpha
{};
std
::
function
<
const
void
*
()
>
get_beta
{};
rocblas_gemm_flags
gemm_flags
=
rocblas_gemm_flags_none
;
rocblas_int
lda
=
0
;
rocblas_int
ldb
=
0
;
rocblas_int
ldc
=
0
;
rocblas_int
ldd
=
0
;
rocblas_int
a_stride
=
0
;
rocblas_int
b_stride
=
0
;
rocblas_int
c_stride
=
0
;
rocblas_int
d_stride
=
0
;
rocblas_datatype
compute_type
=
rocblas_datatype_f32_r
;
rocblas_datatype
arg_type
=
rocblas_datatype_f32_r
;
rocblas_datatype
output_type
=
rocblas_datatype_f32_r
;
bool
strided_batched
=
true
;
bool
is_3inputs
=
true
;
bool
compute_fp32
=
true
;
};
// gemm_impl
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
float
alpha
,
float
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
std
::
vector
<
shape
>
input_shapes
;
std
::
transform
(
args
.
begin
(),
args
.
end
(),
std
::
back_inserter
(
input_shapes
),
[](
const
argument
&
x
)
{
return
x
.
get_shape
();
});
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
gemm_item
.
run
(
ctx
,
args
,
solution_idx
);
}
void
gemm
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
float
alpha
,
float
beta
,
bool
compute_fp32
)
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
gemm_impl
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
compute_fp32
);
std
::
vector
<
shape
>
input_shapes
;
std
::
transform
(
args
.
begin
(),
args
.
end
(),
std
::
back_inserter
(
input_shapes
),
[](
const
argument
&
x
)
{
return
x
.
get_shape
();
});
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
gemm_item
.
run
(
ctx
,
args
,
solution_idx
);
}
/**
* Decides if the tune() or validate() method is appropriate and calls it.
* Return value is the chosen solution index, or 0 to let picker choose it.
*/
int32_t
gemm_finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
,
float
alpha
,
float
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
// This code should be called only if either the environment var.
// MIGRAPHX_ENABLE_GEMM_TUNING, or option --exhaustive-tune, is set
if
(
solution_idx
==
0
)
{
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
tune
(
ctx
,
input_shapes
);
}
else
{
// If a tuned solution index is already given, don't tune again but validate
// in case the data was tuned with a different rocBLAS version
auto
gemm_item
=
gemm_impl
<
float
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
validate
(
ctx
,
input_shapes
,
solution_idx
);
}
#else
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
(
void
)
alpha
,
(
void
)
beta
,
(
void
)
compute_fp32
;
#endif
return
solution_idx
;
}
void
gemm
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
)
/**
* Decides if the tune() or validate() method is appropriate and calls it.
* Return value is the chosen solution index, or 0 to let picker choose it.
*/
int32_t
gemm_finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
)
{
gemm_impl
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
compute_fp32
);
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
if
(
solution_idx
==
0
)
{
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
tune
(
ctx
,
input_shapes
);
}
else
{
// If a tuned solution index is already given, don't tune again but validate
// in case the data was tuned with a different rocBLAS version
auto
gemm_item
=
gemm_impl
<
int32_t
>
(
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
);
solution_idx
=
gemm_item
.
validate
(
ctx
,
input_shapes
,
solution_idx
);
}
#else
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
(
void
)
alpha
,
(
void
)
beta
,
(
void
)
compute_fp32
;
#endif
return
solution_idx
;
}
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/gemm.hpp
View file @
a6bde7c0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
...
...
@@ -40,9 +40,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace
gpu
{
struct
context
;
void
blas_shape
(
const
shape
&
s
);
shape
transpose_batch
(
const
shape
&
s
,
unsigned
trans_batch
);
void
blas_shape
(
const
shape
&
s
);
template
<
class
Op
>
struct
rocblas_gemm
...
...
@@ -52,6 +51,7 @@ struct rocblas_gemm
float
beta
=
0
;
bool
compute_fp32
=
false
;
unsigned
trans_batch
=
0
;
int32_t
solution_idx
=
0
;
template
<
class
Self
,
class
F
>
static
auto
reflect
(
Self
&
self
,
F
f
)
...
...
@@ -60,7 +60,8 @@ struct rocblas_gemm
pack
(
f
(
self
.
alpha
,
"alpha"
),
f
(
self
.
beta
,
"beta"
),
f
(
self
.
compute_fp32
,
"compute_fp32"
),
f
(
self
.
trans_batch
,
"trans_batch"
)));
f
(
self
.
trans_batch
,
"trans_batch"
),
f
(
self
.
solution_idx
,
"solution_idx"
)));
}
std
::
string
name
()
const
...
...
@@ -76,6 +77,8 @@ struct rocblas_gemm
{
std
::
vector
<
shape
>
in_shapes
(
inputs
);
in_shapes
.
pop_back
();
// When input shapes are A, B, C the GEMM equation is C = α AB+ β C where α, β are
// scalars
check_shapes
{
in_shapes
,
*
this
}.
has
(
2
,
3
);
blas_shape
(
inputs
[
0
]);
blas_shape
(
inputs
[
1
]);
...
...
@@ -111,11 +114,12 @@ struct rocblas_gemm
{
if
(
this
->
name
()
==
"gpu::gemm"
)
{
gemm
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
compute_fp32
);
gemm
_compute
(
ctx
,
output_shape
,
args
,
alpha
,
beta
,
compute_fp32
,
solution_idx
);
}
else
{
gemm
(
ctx
,
output_shape
,
args
,
int32_t
(
alpha
),
int32_t
(
beta
),
compute_fp32
);
gemm_compute
(
ctx
,
output_shape
,
args
,
int32_t
(
alpha
),
int32_t
(
beta
),
compute_fp32
,
solution_idx
);
}
return
args
.
back
();
}
...
...
@@ -124,6 +128,33 @@ struct rocblas_gemm
{
return
shapes
.
size
()
-
1
;
}
void
finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
)
{
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
if
(
enabled
(
MIGRAPHX_ENABLE_GEMM_TUNING
{})
or
ctx
.
get_exhaustive_tune_flag
())
{
if
(
this
->
name
()
==
"gpu::gemm"
)
{
solution_idx
=
gemm_finalize
(
ctx
,
output_shape
,
input_shapes
,
alpha
,
beta
,
compute_fp32
,
solution_idx
);
}
else
{
solution_idx
=
gemm_finalize
(
ctx
,
output_shape
,
input_shapes
,
int32_t
(
alpha
),
int32_t
(
beta
),
compute_fp32
,
solution_idx
);
}
}
#else
// suppress compiler warnings
(
void
)
ctx
,
(
void
)
output_shape
,
(
void
)
input_shapes
;
#endif
}
};
}
// namespace gpu
...
...
src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
View file @
a6bde7c0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
...
...
@@ -24,26 +24,64 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP
#include <iterator>
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/context.hpp>
// Set this environment variable to "true" to perform GEMM tuning even when the
// --exhaustive-tune option isn't set. Can be used to skip slow convolution tuning.
MIGRAPHX_DECLARE_ENV_VAR
(
MIGRAPHX_ENABLE_GEMM_TUNING
);
using
milliseconds
=
std
::
chrono
::
duration
<
double
,
std
::
milli
>
;
using
microseconds
=
std
::
chrono
::
duration
<
double
,
std
::
micro
>
;
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
void
gemm
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
float
alpha
,
float
beta
,
bool
compute_fp32
);
void
gemm
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
);
/**
* @brief Templated implementations of the compute() and finalize() methods of the Gemm operator.
* For each function there are overloads using either float or int32_t for the arguments
* alpha and beta.
*
* @param ctx .
* @param output_shape .
* @param args .
* @param alpha .
* @param beta .
* @param compute_fp32 .
*/
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
float
alpha
,
float
beta
,
bool
compute_fp32
,
int32_t
solution_idx
);
void
gemm_compute
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
argument
>&
args
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
);
int32_t
gemm_finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
,
float
alpha
,
float
beta
,
bool
compute_fp32
);
int32_t
gemm_finalize
(
context
&
ctx
,
const
shape
&
output_shape
,
const
std
::
vector
<
shape
>&
input_shapes
,
int32_t
alpha
,
int32_t
beta
,
bool
compute_fp32
,
int32_t
solution_idx
);
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/gpu/include/migraphx/gpu/rocblas.hpp
View file @
a6bde7c0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-202
2
Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-202
3
Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
...
...
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
View file @
a6bde7c0
...
...
@@ -101,7 +101,9 @@ MIGRAPHX_DEVICE_MATH(erf, ::erf)
MIGRAPHX_DEVICE_MATH
(
exp
,
::
exp
)
MIGRAPHX_DEVICE_MATH
(
floor
,
::
floor
)
MIGRAPHX_DEVICE_MATH
(
isnan
,
::
isnan
)
MIGRAPHX_DEVICE_MATH
(
isinf
,
::
isinf
)
MIGRAPHX_DEVICE_MATH
(
log
,
::
log
)
MIGRAPHX_DEVICE_MATH
(
nearbyint
,
::
nearbyint
)
MIGRAPHX_DEVICE_MATH
(
pow
,
::
pow
)
MIGRAPHX_DEVICE_MATH
(
remainder
,
::
remainder
)
MIGRAPHX_DEVICE_MATH
(
round
,
::
round
)
...
...
@@ -135,6 +137,7 @@ MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, ceil, ::hceil)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
cos
,
::
hcos
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
exp
,
::
hexp
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
floor
,
::
hfloor
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
isinf
,
::
__hisinf
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
isnan
,
::
__hisnan
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
log
,
::
hlog
)
MIGRAPHX_DEVICE_MATH_FOR
(
migraphx
::
half
,
rsqrt
,
::
hrsqrt
)
...
...
@@ -150,6 +153,7 @@ MIGRAPHX_DEVICE_MATH_HALF(atan, ::atan)
MIGRAPHX_DEVICE_MATH_HALF
(
atanh
,
::
atanh
)
MIGRAPHX_DEVICE_MATH_HALF
(
cosh
,
::
cosh
)
MIGRAPHX_DEVICE_MATH_HALF
(
erf
,
::
erf
)
MIGRAPHX_DEVICE_MATH_HALF
(
nearbyint
,
::
nearbyint
)
MIGRAPHX_DEVICE_MATH_HALF
(
pow
,
::
pow
)
MIGRAPHX_DEVICE_MATH_HALF
(
remainder
,
::
remainder
)
MIGRAPHX_DEVICE_MATH_HALF
(
round
,
::
round
)
...
...
@@ -229,10 +233,12 @@ MIGRAPHX_DEVICE_MATH_VEC(erf)
MIGRAPHX_DEVICE_MATH_VEC
(
exp
)
MIGRAPHX_DEVICE_MATH_VEC
(
floor
)
MIGRAPHX_DEVICE_MATH_VEC
(
fmod
)
MIGRAPHX_DEVICE_MATH_VEC
(
isinf
)
MIGRAPHX_DEVICE_MATH_VEC
(
isnan
)
MIGRAPHX_DEVICE_MATH_VEC
(
log
)
MIGRAPHX_DEVICE_MATH_VEC
(
max
)
MIGRAPHX_DEVICE_MATH_VEC
(
min
)
MIGRAPHX_DEVICE_MATH_VEC
(
nearbyint
)
MIGRAPHX_DEVICE_MATH_VEC
(
pow
)
MIGRAPHX_DEVICE_MATH_VEC
(
remainder
)
MIGRAPHX_DEVICE_MATH_VEC
(
round
)
...
...
src/version.h.in
View file @
a6bde7c0
...
...
@@ -25,5 +25,5 @@
#define MIGRAPHX_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define MIGRAPHX_VERSION_MINOR @PROJECT_VERSION_MINOR@
#define MIGRAPHX_VERSION_PATCH @PROJECT_VERSION_PATCH@
#define MIGRAPHX_VERSION_TWEAK @PROJECT_VERSION_TWEAK@
#define MIGRAPHX_VERSION_TWEAK
"
@PROJECT_VERSION_TWEAK@
"
// clang-format on
test/api/CMakeLists.txt
View file @
a6bde7c0
...
...
@@ -30,6 +30,9 @@ function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
add_test
(
NAME
${
NAME
}
COMMAND $<TARGET_FILE:
${
NAME
}
> WORKING_DIRECTORY
${
TEST_DIR
}
)
add_dependencies
(
tests
${
NAME
}
)
add_dependencies
(
check
${
NAME
}
)
if
(
WIN32
)
target_compile_definitions
(
${
NAME
}
PRIVATE _CRT_SECURE_NO_WARNINGS
)
endif
()
endfunction
()
# Workaround: C file dont work with clang-tidy right now, need a fix in rocm-cmake
...
...
@@ -41,6 +44,9 @@ function(add_c_api_test TEST_NAME TEST_SRC TEST_DIR)
add_test
(
NAME
${
NAME
}
COMMAND $<TARGET_FILE:
${
NAME
}
> WORKING_DIRECTORY
${
TEST_DIR
}
)
add_dependencies
(
tests
${
NAME
}
)
add_dependencies
(
check
${
NAME
}
)
if
(
WIN32
)
target_compile_definitions
(
${
NAME
}
PRIVATE _CRT_SECURE_NO_WARNINGS
)
endif
()
endfunction
()
add_api_test
(
array_base test_array_base.cpp
${
TEST_ONNX_DIR
}
)
...
...
@@ -57,10 +63,6 @@ add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
add_api_test
(
tf_parser test_tf_parser.cpp
${
TEST_TF_DIR
}
)
# GPU-based tests
if
(
MIGRAPHX_ENABLE_GPU
)
list
(
APPEND CMAKE_PREFIX_PATH /opt/rocm
)
find_package
(
hip
)
add_api_test
(
gpu test_gpu.cpp
${
TEST_ONNX_DIR
}
)
target_link_libraries
(
test_api_gpu
)
add_api_test
(
custom_op_gpu test_custom_op_gpu.cpp
${
TEST_ONNX_DIR
}
)
target_link_libraries
(
test_api_custom_op_gpu
)
endif
()
test/api/test_cpu.cpp
View file @
a6bde7c0
...
...
@@ -198,4 +198,29 @@ TEST_CASE(set_loop_default_iter_num)
EXPECT
(
out_shapes
[
1
].
lengths
()
==
out_lens1
);
}
TEST_CASE
(
set_loop_limit_iterations
)
{
migraphx
::
onnx_options
option
;
option
.
set_default_loop_iterations
(
15
);
option
.
set_limit_loop_iterations
(
10
);
auto
p
=
migraphx
::
parse_onnx
(
"loop_default_test.onnx"
,
option
);
auto
out_shapes
=
p
.
get_output_shapes
();
std
::
vector
<
std
::
size_t
>
out_lens0
=
{
1
};
EXPECT
(
out_shapes
[
0
].
lengths
()
==
out_lens0
);
std
::
vector
<
std
::
size_t
>
out_lens1
=
{
10
,
1
};
EXPECT
(
out_shapes
[
1
].
lengths
()
==
out_lens1
);
}
TEST_CASE
(
set_loop_limit_iterations2
)
{
migraphx
::
onnx_options
option
;
option
.
set_limit_loop_iterations
(
10
);
auto
p
=
migraphx
::
parse_onnx
(
"loop_test_implicit_tripcnt.onnx"
,
option
);
auto
out_shapes
=
p
.
get_output_shapes
();
std
::
vector
<
std
::
size_t
>
out_lens0
=
{
1
};
EXPECT
(
out_shapes
[
0
].
lengths
()
==
out_lens0
);
std
::
vector
<
std
::
size_t
>
out_lens1
=
{
10
,
1
};
EXPECT
(
out_shapes
[
1
].
lengths
()
==
out_lens1
);
}
int
main
(
int
argc
,
const
char
*
argv
[])
{
test
::
run
(
argc
,
argv
);
}
test/api/test_gpu.cpp
View file @
a6bde7c0
...
...
@@ -317,4 +317,59 @@ TEST_CASE(loop_test)
}
}
TEST_CASE
(
loop_test_limit_max_iter
)
{
auto
run_prog
=
[
&
](
int64_t
limit_max_iterations
)
{
migraphx
::
onnx_options
parse_options
;
parse_options
.
set_limit_loop_iterations
(
limit_max_iterations
);
auto
p
=
migraphx
::
parse_onnx
(
"loop_test_implicit_tripcnt.onnx"
,
parse_options
);
auto
shapes_before
=
p
.
get_output_shapes
();
migraphx
::
compile_options
options
;
options
.
set_offload_copy
();
p
.
compile
(
migraphx
::
target
(
"gpu"
),
options
);
auto
shapes_after
=
p
.
get_output_shapes
();
CHECK
(
shapes_before
.
size
()
==
2
);
CHECK
(
bool
{
shapes_before
.
front
()
==
shapes_after
.
front
()});
migraphx
::
program_parameters
pp
;
auto
param_shapes
=
p
.
get_parameter_shapes
();
auto
aas
=
param_shapes
[
"a"
];
std
::
vector
<
float
>
xd
=
{
1.0
f
};
pp
.
add
(
"a"
,
migraphx
::
argument
(
aas
,
xd
.
data
()));
auto
bbs
=
param_shapes
[
"b"
];
std
::
vector
<
float
>
yd
=
{
2.0
};
pp
.
add
(
"b"
,
migraphx
::
argument
(
bbs
,
yd
.
data
()));
auto
cs
=
param_shapes
[
"keep_going_cond"
];
bool
cond
=
true
;
pp
.
add
(
"keep_going_cond"
,
migraphx
::
argument
(
cs
,
&
cond
));
auto
outputs
=
p
.
eval
(
pp
);
auto
output
=
outputs
[
0
];
std
::
vector
<
std
::
vector
<
float
>>
ret
;
ret
.
push_back
(
output
.
as_vector
<
float
>
());
output
=
outputs
[
1
];
ret
.
push_back
(
output
.
as_vector
<
float
>
());
return
ret
;
};
{
auto
result_vector
=
run_prog
(
5
);
std
::
vector
<
float
>
gold0
=
{
2.0
f
};
EXPECT
(
result_vector
.
at
(
0
)
==
gold0
);
std
::
vector
<
float
>
gold1
=
{
-
2
,
4
,
0
,
0
,
0
};
EXPECT
(
result_vector
.
at
(
1
)
==
gold1
);
}
{
auto
result_vector
=
run_prog
(
20
);
std
::
vector
<
float
>
gold0
=
{
2.0
f
};
EXPECT
(
result_vector
.
at
(
0
)
==
gold0
);
std
::
vector
<
float
>
gold1
=
{
-
2
,
4
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
EXPECT
(
result_vector
.
at
(
1
)
==
gold1
);
}
}
int
main
(
int
argc
,
const
char
*
argv
[])
{
test
::
run
(
argc
,
argv
);
}
test/gpu/codegen_literal.cpp
View file @
a6bde7c0
...
...
@@ -64,7 +64,7 @@ TEST_CASE(mul_literal_round_test)
auto
l1
=
mm
->
add_literal
(
1
/
0.00787402
f
);
auto
mul
=
mm
->
add_instruction
(
migraphx
::
make_op
(
"mul"
),
l0
,
l1
);
auto
round
=
mm
->
add_instruction
(
migraphx
::
make_op
(
"
round
"
),
mul
);
auto
round
=
mm
->
add_instruction
(
migraphx
::
make_op
(
"
nearbyint
"
),
mul
);
mm
->
add_return
({
round
});
...
...
test/gpu/gemm_tune.cpp
0 → 100644
View file @
a6bde7c0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iostream>
#include <vector>
#include <migraphx/gpu/gemm.hpp>
#include <hip/hip_runtime_api.h>
#include <migraphx/gpu/target.hpp>
#include <migraphx/verify.hpp>
#include <test.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
// includes needed for run_lowering
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/pass_manager.hpp>
// Abbreviated lowering; we don't need the usual cleanup passes for this test
void
run_lowering
(
migraphx
::
program
&
p
,
bool
offload_copy
=
false
)
{
auto
ctx
=
migraphx
::
gpu
::
context
{};
migraphx
::
run_passes
(
*
p
.
get_main_module
(),
{
migraphx
::
auto_contiguous
{},
migraphx
::
gpu
::
lowering
{
&
ctx
,
offload_copy
}});
}
/**
* Tests the automatic GEMM tuning feature. In the finalize() method of the gemm op,
* rocBLAS API functions are called to quickly benchmark all the GEMM solutions
* available in the currently installed rocBLAS library and choose the index of the fastest.
*/
TEST_CASE
(
gemm_tune_with_rocblas
)
{
migraphx
::
program
p
;
auto
*
mm
=
p
.
get_main_module
();
migraphx
::
shape
sa
{
migraphx
::
shape
::
float_type
,
{
4
,
2
}};
migraphx
::
shape
sb
{
migraphx
::
shape
::
float_type
,
{
2
,
3
}};
auto
a
=
mm
->
add_parameter
(
"a"
,
sa
);
auto
b
=
mm
->
add_parameter
(
"b"
,
sb
);
migraphx
::
operation
dot_op
=
migraphx
::
make_op
(
"dot"
);
mm
->
add_instruction
(
dot_op
,
a
,
b
);
// lowering adds gemm implementation for dot operator
run_lowering
(
p
);
migraphx
::
target
gpu_t
=
migraphx
::
gpu
::
target
{};
migraphx
::
compile_options
options
;
options
.
exhaustive_tune
=
true
;
p
.
compile
(
gpu_t
,
options
);
migraphx
::
value
solution_idx
(
0
);
for
(
auto
ins
:
iterator_for
(
*
p
.
get_main_module
()))
{
if
(
ins
->
name
()
==
"gpu::gemm"
)
{
auto
gemm_op
=
migraphx
::
get_operation
(
ins
);
// tuned solution index is not deterministic, but anything other than 0
// (default, invalid, or not available) is good.
// gemm_op.to_value().debug_print();
solution_idx
=
gemm_op
.
to_value
()[
"solution_idx"
];
break
;
}
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
EXPECT
(
0
!=
solution_idx
.
to
<
std
::
size_t
>
());
#else
EXPECT
(
0
==
solution_idx
.
to
<
std
::
size_t
>
());
#endif
}
// GEMM tuning of a strided-batch matrix; invokes rocblas_gemm_strided_batched_ex
TEST_CASE
(
gemm_tune_strided
)
{
migraphx
::
program
p
;
auto
*
mm
=
p
.
get_main_module
();
migraphx
::
shape
sa
{
migraphx
::
shape
::
float_type
,
{
4
,
2
,
2
}};
migraphx
::
shape
sb
{
migraphx
::
shape
::
float_type
,
{
4
,
2
,
2
}};
migraphx
::
shape
s_output
{
migraphx
::
shape
::
float_type
,
{
4
,
2
,
2
}};
auto
a
=
mm
->
add_parameter
(
"a"
,
sa
);
auto
b
=
mm
->
add_parameter
(
"b"
,
sb
);
auto
output
=
mm
->
add_parameter
(
"out"
,
s_output
);
auto
gemm_oper
=
migraphx
::
make_op
(
"gpu::gemm"
,
{{
"beta"
,
2
}});
mm
->
add_instruction
(
gemm_oper
,
a
,
b
,
output
);
migraphx
::
target
gpu_t
=
migraphx
::
gpu
::
target
{};
migraphx
::
compile_options
options
;
options
.
exhaustive_tune
=
true
;
p
.
compile
(
gpu_t
,
options
);
migraphx
::
value
solution_idx
(
0
);
for
(
auto
ins
:
iterator_for
(
*
p
.
get_main_module
()))
{
if
(
ins
->
name
()
==
"gpu::gemm"
)
{
auto
gemm_op
=
migraphx
::
get_operation
(
ins
);
auto
gemmv
=
gemm_op
.
to_value
();
// tuned solution index is not deterministic, but anything other than 0
// (default, invalid, or not available) is good.
solution_idx
=
gemm_op
.
to_value
()[
"solution_idx"
];
break
;
}
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
EXPECT
(
0
!=
solution_idx
.
to
<
std
::
size_t
>
());
#else
EXPECT
(
0
==
solution_idx
.
to
<
std
::
size_t
>
());
#endif
}
// GEMM tuning of a strided-batch matrix; created by lowering
TEST_CASE
(
gemm_tune_strided_lowered
)
{
migraphx
::
program
p
;
auto
*
mm
=
p
.
get_main_module
();
// At time of writing this test, gemm_impl considers a shape is strided if it has
// at least three dimensions and the 3rd-to-last is nonzero, invoking
// rocblas_gemm_strided_batched_ex. Also, DOT operator requires all dimensions except the last
// two to be equal.
migraphx
::
shape
sa
{
migraphx
::
shape
::
float_type
,
{
4
,
2
,
5
}};
migraphx
::
shape
sb
{
migraphx
::
shape
::
float_type
,
{
4
,
5
,
3
}};
auto
a
=
mm
->
add_parameter
(
"a"
,
sa
);
auto
b
=
mm
->
add_parameter
(
"b"
,
sb
);
migraphx
::
operation
dot_op
=
migraphx
::
make_op
(
"dot"
);
mm
->
add_instruction
(
dot_op
,
a
,
b
);
// lowering adds gemm implementation for dot operator
run_lowering
(
p
);
migraphx
::
target
gpu_t
=
migraphx
::
gpu
::
target
{};
migraphx
::
compile_options
options
;
options
.
exhaustive_tune
=
true
;
p
.
compile
(
gpu_t
,
options
);
migraphx
::
value
solution_idx
(
0
);
for
(
auto
ins
:
iterator_for
(
*
p
.
get_main_module
()))
{
if
(
ins
->
name
()
==
"gpu::gemm"
)
{
auto
gemm_op
=
migraphx
::
get_operation
(
ins
);
// tuned solution index is not deterministic, but anything other than 0
// (default, invalid, or not available) is good.
solution_idx
=
gemm_op
.
to_value
()[
"solution_idx"
];
break
;
}
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
EXPECT
(
0
!=
solution_idx
.
to
<
std
::
size_t
>
());
#else
EXPECT
(
0
==
solution_idx
.
to
<
std
::
size_t
>
());
#endif
}
TEST_CASE
(
gemm_tune_invalid_sol_index
)
{
migraphx
::
program
p
;
auto
*
mm
=
p
.
get_main_module
();
migraphx
::
shape
sa
{
migraphx
::
shape
::
float_type
,
{
4
,
2
}};
migraphx
::
shape
sb
{
migraphx
::
shape
::
float_type
,
{
2
,
3
}};
migraphx
::
shape
s_output
{
migraphx
::
shape
::
float_type
,
{
4
,
3
}};
auto
a
=
mm
->
add_parameter
(
"a"
,
sa
);
auto
b
=
mm
->
add_parameter
(
"b"
,
sb
);
auto
output
=
mm
->
add_parameter
(
"out"
,
s_output
);
auto
gemm_oper
=
migraphx
::
make_op
(
"gpu::gemm"
,
{{
"solution_idx"
,
987654321
}});
mm
->
add_instruction
(
gemm_oper
,
a
,
b
,
output
);
migraphx
::
target
gpu_t
=
migraphx
::
gpu
::
target
{};
migraphx
::
compile_options
options
;
options
.
exhaustive_tune
=
true
;
p
.
compile
(
gpu_t
,
options
);
migraphx
::
value
solution_idx
(
0
);
for
(
auto
ins
:
iterator_for
(
*
p
.
get_main_module
()))
{
if
(
ins
->
name
()
==
"gpu::gemm"
)
{
auto
gemm_op
=
migraphx
::
get_operation
(
ins
);
auto
gemmv
=
gemm_op
.
to_value
();
// given invalid starting index, should return default 0
solution_idx
=
gemm_op
.
to_value
()[
"solution_idx"
];
break
;
}
}
#ifdef MIGRAPHX_USE_ROCBLAS_TUNING_API
EXPECT
(
0
==
solution_idx
.
to
<
std
::
size_t
>
());
#else
EXPECT
(
0
!=
solution_idx
.
to
<
std
::
size_t
>
());
#endif
}
int
main
(
int
argc
,
const
char
*
argv
[])
{
test
::
run
(
argc
,
argv
);
}
test/onnx/.onnxrt-commit
View file @
a6bde7c0
2eeafc37bca21dc8bf337dda7020b486543162d7
b7b8b5b2ce80edb33990c7ae0fedac6ae3c623f4
test/onnx/gen_onnx.py
View file @
a6bde7c0
...
...
@@ -3858,6 +3858,64 @@ def instance_norm_val_3d_test():
return ([node], [], [y], [x_tensor, scale_tensor, bias_tensor])
@onnx_test()
def isinf_half_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT16, [2, 3])
t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
node = onnx.helper.make_node(
'IsInf',
inputs=['t1'],
outputs=['t2'],
)
return ([node], [t1], [t2])
@onnx_test()
def isinf_neg_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
node = onnx.helper.make_node(
'IsInf',
detect_negative=[1],
detect_positive=[0],
inputs=['t1'],
outputs=['t2'],
)
return ([node], [t1], [t2])
@onnx_test()
def isinf_double_pos_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.DOUBLE, [2, 3])
t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
node = onnx.helper.make_node(
'IsInf',
detect_negative=[0],
detect_positive=[1],
inputs=['t1'],
outputs=['t2'],
)
return ([node], [t1], [t2])
@onnx_test()
def isinf_no_detect_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
t2 = helper.make_tensor_value_info('t2', TensorProto.BOOL, [2, 3])
node = onnx.helper.make_node(
'IsInf',
detect_negative=[0],
detect_positive=[0],
inputs=['t1'],
outputs=['t2'],
)
return ([node], [t1], [t2])
@onnx_test()
def isnan_float_test():
t1 = helper.make_tensor_value_info('t1', TensorProto.FLOAT, [2, 3])
...
...
@@ -4276,6 +4334,50 @@ def loop_test():
return ([node], [iter, cond, a, b], [b_loop, uout])
@onnx_test()
def loop_test_implicit_tripcnt():
body = helper.make_graph([
helper.make_node("Add", ["a", "b_in"], ["my_local"]),
helper.make_node("Sub", ["a", "b_in"], ["a_sub_b_in"]),
helper.make_node("Greater", ["my_local", "a_sub_b_in"],
["keep_going"]),
helper.make_node("Add", ["a_sub_b_in", "a_sub_b_in"],
["user_defined_vals"]),
], "body", [
helper.make_tensor_value_info('iteration_num', TensorProto.INT64, [1]),
helper.make_tensor_value_info('keep_going_inp', TensorProto.BOOL, [1]),
helper.make_tensor_value_info('b_in', TensorProto.FLOAT, [1])
], [
helper.make_tensor_value_info('keep_going', TensorProto.BOOL, [1]),
helper.make_tensor_value_info('a_sub_b_in', TensorProto.FLOAT, [1]),
helper.make_tensor_value_info('my_local', TensorProto.FLOAT, [1]),
helper.make_tensor_value_info('user_defined_vals', TensorProto.FLOAT,
[1]),
])
iter = helper.make_tensor(name='max_trip_count',
data_type=TensorProto.INT64,
dims=[1],
vals=[15])
node = helper.make_node(
"Loop",
inputs=["max_trip_count", "keep_going_cond", "b"],
outputs=["b_loop", "my_local_loop", "user_defined_vals_loop"],
body=body)
a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [1])
b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [1])
cond = helper.make_tensor_value_info('keep_going_cond', TensorProto.BOOL,
[1])
b_loop = helper.make_tensor_value_info('b_loop', TensorProto.FLOAT, [1])
uout = helper.make_tensor_value_info('user_defined_vals_loop',
TensorProto.FLOAT, [2, 1])
return ([node], [cond, a, b], [b_loop, uout], [iter])
@onnx_test()
def lpnormalization_axis_error_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3])
...
...
@@ -4883,9 +4985,9 @@ def mod_test_fmod_different_dtypes():
@onnx_test()
def multinomial_test():
sample_size
=
1
0
seed
=
0.
0
input
=
helper
.
make_tensor_value_info
(
"input"
,
TensorProto
.
FLOAT
,
[
1
,
10
])
sample_size = 1
3
seed = 0.
input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [
3
, 10])
output = helper.make_tensor_value_info("output", TensorProto.INT32,
[1, 10])
...
...
@@ -4898,6 +5000,44 @@ def multinomial_test():
return ([node], [input], [output])
@onnx_test()
def multinomial_dyn_test():
sample_size = 100000
seed = 1.3
categories = 5
input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
[None, categories])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT,
[None, categories])
node = onnx.helper.make_node(
'Multinomial',
inputs=['input'],
sample_size=sample_size,
dtype=1, # shape::float_type
seed=seed,
outputs=['output'])
return ([node], [input], [output])
@onnx_test()
def multinomial_autoseed_dyn_test():
# If seed attribute is not given, device should auto generate one at runtime
sample_size = 12
input = helper.make_tensor_value_info("input", TensorProto.FLOAT,
[None, 10])
output = helper.make_tensor_value_info("output", TensorProto.INT32,
[None, 10])
node = onnx.helper.make_node('Multinomial',
inputs=['input'],
sample_size=sample_size,
outputs=['output'])
return ([node], [input], [output])
@onnx_test()
def multinomial_generated_seed_test():
sample_size = 10
...
...
@@ -6947,6 +7087,16 @@ def roialign_test():
return ([node], [x, roi, bi], [y])
@onnx_test()
def round_half_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [4, 4])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [4, 4])
node = onnx.helper.make_node('Round', inputs=['x'], outputs=['y'])
return ([node], [x], [y])
@onnx_test()
def scatter_add_test():
x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 4, 5, 6])
...
...
@@ -7866,6 +8016,32 @@ def slice_var_input_dyn1():
return ([node], [data, starts, ends, axes], [output])
@onnx_test()
def slice_var_input_default_steps():
step = np.array([1, 1])
step_tensor = helper.make_tensor(name="step",
data_type=TensorProto.INT64,
dims=step.shape,
vals=step.astype(int))
arg_step = helper.make_node("Constant",
inputs=[],
outputs=['arg_step'],
value=step_tensor)
data = helper.make_tensor_value_info('data', TensorProto.FLOAT, [None, 2])
starts = helper.make_tensor_value_info('starts', TensorProto.INT64, [2])
ends = helper.make_tensor_value_info('ends', TensorProto.INT64, [2])
axes = helper.make_tensor_value_info('axes', TensorProto.INT64, [2])
output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 2])
node = onnx.helper.make_node(
'Slice',
inputs=['data', 'starts', 'ends', 'axes', 'arg_step'],
outputs=['output'])
return ([arg_step, node], [data, starts, ends, axes], [output])
@onnx_test()
def slice_var_input_steps_error():
step = np.array([2, 1])
...
...
@@ -7879,9 +8055,9 @@ def slice_var_input_steps_error():
value=step_tensor)
data = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 2])
starts
=
helper
.
make_tensor_value_info
(
'starts'
,
TensorProto
.
FLOAT
,
[
2
])
ends
=
helper
.
make_tensor_value_info
(
'ends'
,
TensorProto
.
FLOAT
,
[
2
])
axes
=
helper
.
make_tensor_value_info
(
'axes'
,
TensorProto
.
FLOAT
,
[
2
])
starts = helper.make_tensor_value_info('starts', TensorProto.
INT64
, [2])
ends = helper.make_tensor_value_info('ends', TensorProto.
INT64
, [2])
axes = helper.make_tensor_value_info('axes', TensorProto.
INT64
, [2])
output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 2])
node = onnx.helper.make_node(
...
...
@@ -8042,6 +8218,42 @@ def split_test_no_attribute():
return ([const_node, node], [x], [y1, y2, y3, y4])
@onnx_test()
def split_test_uneven():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [12, 15])
y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [3, 15])
y5 = helper.make_tensor_value_info('y5', TensorProto.FLOAT, [0, 15])
node = onnx.helper.make_node(
'Split',
inputs=['x'],
outputs=['y1', 'y2', 'y3', 'y4', 'y5'],
)
return ([node], [x], [y1, y2, y3, y4, y5])
@onnx_test()
def split_test_uneven_num_outputs():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [2, 15])
node = onnx.helper.make_node(
'Split',
inputs=['x'],
outputs=['y1', 'y2', 'y3', 'y4'],
num_outputs=4,
)
return ([node], [x], [y1, y2, y3, y4])
@onnx_test()
def split_test_no_attribute_invalid_split():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [300, 15])
...
...
@@ -8101,6 +8313,24 @@ def split_test_no_attribute_invalid_input_split():
return ([node], [x], [y1, y2, y3])
@onnx_test()
def split_test_invalid_num_outputs():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [11, 15])
y1 = helper.make_tensor_value_info('y1', TensorProto.FLOAT, [3, 15])
y2 = helper.make_tensor_value_info('y2', TensorProto.FLOAT, [3, 15])
y3 = helper.make_tensor_value_info('y3', TensorProto.FLOAT, [3, 15])
y4 = helper.make_tensor_value_info('y4', TensorProto.FLOAT, [2, 15])
node = onnx.helper.make_node(
'Split',
inputs=['x'],
outputs=['y1', 'y2', 'y3', 'y4'],
num_outputs=5,
)
return ([node], [x], [y1, y2, y3, y4])
@onnx_test()
def sqrt_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])
...
...
@@ -8837,6 +9067,20 @@ def upsample_test():
return ([node], [X], [Y], [scale_tensor])
@onnx_test()
def upsample_ver7_test():
X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1, 2, 2])
Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 1, 4, 6])
node = onnx.helper.make_node('Upsample',
inputs=['X'],
outputs=['Y'],
mode='nearest',
scales=[1.0, 1.0, 2.0, 3.0])
return ([node], [X], [Y])
@onnx_test()
def variable_batch_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
...
...
test/onnx/isinf_double_pos_test.onnx
0 → 100644
View file @
a6bde7c0
File added
test/onnx/isinf_half_neg_test.onnx
0 → 100644
View file @
a6bde7c0
File added
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment