gaoqiong / composable_kernel

Commit 68886f7d, authored Jun 14, 2022 by raman jana

    merging with latest develop branch

Parents: a9ee2960, 1677cf70

Changes: 328 · Showing 20 changed files with 655 additions and 855 deletions (+655 −855)
Files shown (additions / deletions):

    profiler/include/profile_grouped_gemm_impl.hpp             +5    −2
    profiler/include/profile_reduce_impl.hpp                    +136  −319
    profiler/src/profile_batched_gemm.cpp                       +20   −20
    profiler/src/profile_batched_gemm_reduce.cpp                +8    −8
    profiler/src/profile_conv_bwd_data.cpp                      +0    −195  (deleted)
    profiler/src/profile_conv_bwd_weight.cpp                    +4    −4
    profiler/src/profile_conv_fwd_bias_relu.cpp                 +4    −4
    profiler/src/profile_conv_fwd_bias_relu_add.cpp             +4    −4
    profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp      +4    −4
    profiler/src/profile_convnd_bwd_data.cpp                    +5    −5
    profiler/src/profile_convnd_fwd.cpp                         +17   −17
    profiler/src/profile_gemm.cpp                               +36   −20
    profiler/src/profile_gemm_bias_2d.cpp                       +12   −12
    profiler/src/profile_gemm_bias_relu.cpp                     +8    −8
    profiler/src/profile_gemm_bias_relu_add.cpp                 +8    −8
    profiler/src/profile_gemm_reduce.cpp                        +8    −8
    profiler/src/profile_grouped_gemm.cpp                       +12   −8
    profiler/src/profile_reduce.cpp                             +79   −155
    profiler/src/profiler.cpp                                   +3    −2
    script/parse_perf_data.py                                   +282  −52
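The thread running through nearly every file below is one API migration in the profiler: the `int nrepeat` repeat-count argument is replaced by a `bool time_kernel` flag, which device-op invokers consume through a `StreamConfig` descriptor, and the drivers now return 0 on success instead of 1. A minimal, self-contained sketch of the new convention; the `StreamConfig` struct and `run` function here are stand-ins for illustration, not the library's real definitions:

#include <cstdio>

// Stand-in for composable_kernel's StreamConfig (stream handle + timing flag).
struct StreamConfig
{
    void* stream_id_  = nullptr;
    bool time_kernel_ = false;
};

// Stand-in for an invoker's Run(): launches the kernel and, when asked,
// times it and returns the averaged elapsed milliseconds.
float run(const StreamConfig& cfg)
{
    return cfg.time_kernel_ ? 0.42f /* pretend measurement, ms */ : 0.0f;
}

int main()
{
    const bool time_kernel = true; // was: const int nrepeat = std::stoi(argv[7]);

    // Old call shape:  invoker_ptr->Run(argument_ptr.get(), nrepeat);
    // New call shape:
    float ave_time = run(StreamConfig{nullptr, time_kernel});

    std::printf("ave_time = %f ms\n", ave_time);
    return 0;
}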
profiler/include/profile_grouped_gemm_impl.hpp

@@ -43,13 +43,14 @@ namespace profiler {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
+          typename AccDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
 void profile_grouped_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
-                               int nrepeat,
+                               bool time_kernel,
                                const std::vector<int>& Ms,
                                const std::vector<int>& Ns,
                                const std::vector<int>& Ks,
@@ -231,7 +232,8 @@ void profile_grouped_gemm_impl(int do_verification,
 {
     std::string gemm_name = gemm_ptr->GetTypeString();

-    float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+    float ave_time =
+        invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = 0, num_btype = 0;

     for(std::size_t i = 0; i < gemm_shapes.size(); i++)
@@ -270,6 +272,7 @@ void profile_grouped_gemm_impl(int do_verification,
         ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                   BDataType,
                                                   CDataType,
+                                                  AccDataType,
                                                   AElementOp,
                                                   BElementOp,
                                                   CElementOp>;
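For context on the hunk above: `ave_time` feeds the profiler's throughput computation, which sums flop and byte counts over every GEMM in the group. A self-contained sketch of that accounting; the shapes, the 2-byte fp16 element size, and the 2*M*N*K flop model are illustrative assumptions, not taken verbatim from this file:

#include <cstddef>
#include <cstdio>
#include <vector>

struct GemmShape
{
    int M, N, K;
};

int main()
{
    std::vector<GemmShape> gemm_shapes{{256, 256, 64}, {128, 512, 128}};
    float ave_time = 0.05f; // ms, as returned by invoker_ptr->Run(...)

    std::size_t flop = 0, num_btype = 0;
    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        const auto& s = gemm_shapes[i];
        // 2*M*N*K multiply-accumulate flops per GEMM
        flop += std::size_t(2) * s.M * s.N * s.K;
        // bytes moved for A (M*K), B (K*N), C (M*N); 2-byte fp16 elements assumed
        num_btype += 2u * (std::size_t(s.M) * s.K + std::size_t(s.K) * s.N +
                           std::size_t(s.M) * s.N);
    }

    float tflops     = static_cast<float>(flop) / 1.0e9f / ave_time;
    float gb_per_sec = static_cast<float>(num_btype) / 1.0e6f / ave_time;
    std::printf("%f TFLOPS, %f GB/s\n", tflops, gb_per_sec);
    return 0;
}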
profiler/include/profile_reduce_impl.hpp

@@ -5,74 +5,77 @@
 #include "device_reduce_instance.hpp"
 #include "reduction_enums.hpp"
 #include "host_reduction.hpp"
+#include "host_common_util.hpp"
+#include "host_tensor_generator.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_reduce_instance {

-template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
+template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
 struct ReduceDescription
 {
     static constexpr int Rank_         = Rank;
     static constexpr int NumReduceDim_ = NumReduceDim;
     static constexpr int ReduceOpId_   = ReduceOpId;
-    static constexpr int NanOpt_       = NanOpt;
-    static constexpr int IndicesOpt_   = IndicesOpt;
+    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr int UseIndex_     = UseIndex;
 };

 using reduce_description_instances =
-    std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
-               ReduceDescription<4, 4, 0, 0, 0>,
-               ReduceDescription<4, 1, 0, 0, 0>,
-               ReduceDescription<2, 1, 0, 0, 0>,
-
-               ReduceDescription<4, 3, 5, 0, 0>, // for AVG
-               ReduceDescription<4, 4, 5, 0, 0>,
-               ReduceDescription<4, 1, 5, 0, 0>,
-               ReduceDescription<2, 1, 5, 0, 0>,
-
-               ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
-               ReduceDescription<4, 4, 7, 0, 0>,
-               ReduceDescription<4, 1, 7, 0, 0>,
-               ReduceDescription<2, 1, 7, 0, 0>,
-
-               ReduceDescription<4, 3, 2, 0, 0>, // for MIN
-               ReduceDescription<4, 4, 2, 0, 0>,
-               ReduceDescription<4, 1, 2, 0, 0>,
-               ReduceDescription<2, 1, 2, 0, 0>,
-
-               ReduceDescription<4, 3, 3, 0, 0>, // for MAX
-               ReduceDescription<4, 4, 3, 0, 0>,
-               ReduceDescription<4, 1, 3, 0, 0>,
-               ReduceDescription<2, 1, 3, 0, 0>,
-
-               ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
-               ReduceDescription<4, 4, 4, 0, 0>,
-               ReduceDescription<4, 1, 4, 0, 0>,
-               ReduceDescription<2, 1, 4, 0, 0>,
-
-               ReduceDescription<4, 3, 2, 0, 1>, // for MIN
-               ReduceDescription<4, 4, 2, 0, 1>,
-               ReduceDescription<4, 1, 2, 0, 1>,
-               ReduceDescription<2, 1, 2, 0, 1>,
-
-               ReduceDescription<4, 3, 3, 0, 1>, // for MAX
-               ReduceDescription<4, 4, 3, 0, 1>,
-               ReduceDescription<4, 1, 3, 0, 1>,
-               ReduceDescription<2, 1, 3, 0, 1>,
-
-               ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
-               ReduceDescription<4, 4, 4, 0, 1>,
-               ReduceDescription<4, 1, 4, 0, 1>,
-               ReduceDescription<2, 1, 4, 0, 1>>;
+    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+               ReduceDescription<4, 4, 0, false, false>,
+               ReduceDescription<4, 1, 0, false, false>,
+               ReduceDescription<2, 1, 0, false, false>,
+
+               ReduceDescription<4, 3, 5, false, false>, // for AVG
+               ReduceDescription<4, 4, 5, false, false>,
+               ReduceDescription<4, 1, 5, false, false>,
+               ReduceDescription<2, 1, 5, false, false>,
+
+               ReduceDescription<4, 3, 7, false, false>, // for NORM2
+               ReduceDescription<4, 4, 7, false, false>,
+               ReduceDescription<4, 1, 7, false, false>,
+               ReduceDescription<2, 1, 7, false, false>,
+
+               ReduceDescription<4, 3, 2, false, false>, // for MIN
+               ReduceDescription<4, 4, 2, false, false>,
+               ReduceDescription<4, 1, 2, false, false>,
+               ReduceDescription<2, 1, 2, false, false>,
+
+               ReduceDescription<4, 3, 3, false, false>, // for MAX
+               ReduceDescription<4, 4, 3, false, false>,
+               ReduceDescription<4, 1, 3, false, false>,
+               ReduceDescription<2, 1, 3, false, false>,
+
+               ReduceDescription<4, 3, 4, false, false>, // for AMAX
+               ReduceDescription<4, 4, 4, false, false>,
+               ReduceDescription<4, 1, 4, false, false>,
+               ReduceDescription<2, 1, 4, false, false>,
+
+               ReduceDescription<4, 3, 2, false, true>, // for MIN
+               ReduceDescription<4, 4, 2, false, true>,
+               ReduceDescription<4, 1, 2, false, true>,
+               ReduceDescription<2, 1, 2, false, true>,
+
+               ReduceDescription<4, 3, 3, false, true>, // for MAX
+               ReduceDescription<4, 4, 3, false, true>,
+               ReduceDescription<4, 1, 3, false, true>,
+               ReduceDescription<2, 1, 3, false, true>,
+
+               ReduceDescription<4, 3, 4, false, true>, // for AMAX
+               ReduceDescription<4, 4, 4, false, true>,
+               ReduceDescription<4, 1, 4, false, true>,
+               ReduceDescription<2, 1, 4, false, true>>;

 template <typename DescriptionType>
 bool description_match(const DescriptionType& description,
                        int Rank,
                        const std::vector<int>& reduceDims,
                        ReduceTensorOp ReduceOpId,
-                       NanPropagation NanOpt,
-                       ReduceTensorIndices IndicesOpt)
+                       bool PropagateNan,
+                       bool UseIndex)
 {
     if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
-       description.NanOpt_ != static_cast<int>(NanOpt) ||
-       description.IndicesOpt_ != static_cast<int>(IndicesOpt))
+       description.PropagateNan_ != static_cast<int>(PropagateNan) ||
+       description.UseIndex_ != static_cast<int>(UseIndex))
         return (false);

     if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -116,48 +119,18 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
     return invariantDims;
 };

-template <typename T>
-static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
-{
-    std::ofstream outFile(fileName, std::ios::binary);
-    if(outFile)
-    {
-        outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
-        outFile.close();
-        std::cout << "Write output to file " << fileName << std::endl;
-    }
-    else
-    {
-        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
-    }
-};
-
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
-
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           int Rank,
           int NumReduceDim,
           ReduceTensorOp ReduceOpId,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
-void profile_reduce_impl_impl(bool do_verification,
+          bool PropagateNan,
+          bool UseIndex>
+bool profile_reduce_impl_impl(bool do_verification,
                               int init_method,
                               bool do_log,
                               bool do_dumpout,
-                              int nrepeat,
+                              bool time_kernel,
                               const std::vector<size_t>& inLengths,
                               const std::vector<int>& reduceDims,
                               float alpha,
@@ -165,16 +138,13 @@ void profile_reduce_impl_impl(bool do_verification,
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
+    using ck::host_common::dumpBufferToFile;

     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);

-    constexpr bool NeedIndices =
-        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
-
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN);
+    constexpr bool OutputIndex = (op_support_indices && UseIndex);

     constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
     constexpr bool op_support_atomic_add =
@@ -195,8 +165,7 @@ void profile_reduce_impl_impl(bool do_verification,
         (op_support_indices && !std::is_same<AccDataType, float>::value);

     // 1) The indices can only be used when the reduction operation is indexable
-    constexpr bool invalid_reduce_3 =
-        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex);

     // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
     // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
@@ -219,6 +188,8 @@ void profile_reduce_impl_impl(bool do_verification,
     constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
                                      invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);

+    bool pass = true;
+
     if constexpr(!invalid_reduce)
     {
         Tensor<InDataType> in(inLengths);
@@ -282,42 +253,26 @@ void profile_reduce_impl_impl(bool do_verification,
         if(beta != 0.0f)
             out_dev.ToDevice(out.mData.data());

-        size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
+        size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0;

         DeviceMem out_indices_dev(indicesSizeInBytes);

         float best_avg_time   = 0;
         float best_gb_per_sec = 0;

-        using InElementwiseOperation_0 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-                InElementwiseOperation;
-        using AccElementwiseOperation_0 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-                AccElementwiseOperation;
-        using InElementwiseOperation_1 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-                InElementwiseOperation;
-        using AccElementwiseOperation_1 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-                AccElementwiseOperation;
-        using InElementwiseOperation_2 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-                InElementwiseOperation;
-        using AccElementwiseOperation_2 =
-            typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-                AccElementwiseOperation;
+        using InElementwiseOperation =
+            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+                InElementwiseOperation;
+        using AccElementwiseOperation =
+            typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+                AccElementwiseOperation;
+        using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;

-        using DeviceReduceInstPtr0 =
-            DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-        using DeviceReduceInstPtr1 =
-            DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-        using DeviceReduceInstPtr2 =
-            DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
+        using DeviceReduceInstPtr0 =
+            DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;

         std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-        std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-        std::vector<DeviceReduceInstPtr2> reduce2_ptrs;

         add_device_reduce_instance_threadwise<InDataType,
                                               AccDataType,
@@ -325,8 +280,8 @@ void profile_reduce_impl_impl(bool do_verification,
                                               Rank,
                                               NumReduceDim,
                                               ReduceOpId,
-                                              NanOpt,
-                                              IndicesOpt>(reduce0_ptrs);
+                                              PropagateNan,
+                                              UseIndex>(reduce0_ptrs);

         add_device_reduce_instance_blockwise<InDataType,
                                              AccDataType,
@@ -334,8 +289,8 @@ void profile_reduce_impl_impl(bool do_verification,
                                              Rank,
                                              NumReduceDim,
                                              ReduceOpId,
-                                             NanOpt,
-                                             IndicesOpt>(reduce0_ptrs);
+                                             PropagateNan,
+                                             UseIndex>(reduce0_ptrs);

         if constexpr(use_atomic_add)
         {
@@ -345,35 +300,11 @@ void profile_reduce_impl_impl(bool do_verification,
                                                   Rank,
                                                   NumReduceDim,
                                                   ReduceOpId,
-                                                  NanOpt,
-                                                  IndicesOpt>(reduce0_ptrs);
+                                                  PropagateNan,
+                                                  UseIndex>(reduce0_ptrs);
         }
-        else
-        {
-            add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                                 AccDataType,
-                                                                 OutDataType,
-                                                                 Rank,
-                                                                 NumReduceDim,
-                                                                 ReduceOpId,
-                                                                 NanOpt,
-                                                                 IndicesOpt>(reduce1_ptrs);
-        };
-
-        // used for secondary reduction
-        if constexpr(!use_atomic_add)
-        {
-            add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                             AccDataType,
-                                                             OutDataType,
-                                                             Rank,
-                                                             NumReduceDim,
-                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce2_ptrs);
-        };
-
-        if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
+        if(reduce0_ptrs.empty())
         {
             throw std::runtime_error("Wrong! No device REDUCE instance found");
         };
@@ -383,31 +314,34 @@ void profile_reduce_impl_impl(bool do_verification,
             ReductionHost<InDataType,
                           AccDataType,
                           OutDataType,
-                          ReduceOpId,
+                          ReduceOperation,
+                          InElementwiseOperation,
+                          AccElementwiseOperation,
                           Rank,
                           NumReduceDim,
                           PropagateNan,
-                          NeedIndices>
+                          OutputIndex>
                 hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

             hostReduce.Run(
                 alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
         };

-        const auto i_inLengths  = to_int_vector(inLengths);
-        const auto i_inStrides  = to_int_vector(inStrides);
-        const auto i_outLengths = to_int_vector(outLengths);
-        const auto i_outStrides = to_int_vector(outStrides);
+        std::vector<ck::index_t> i_inLengths;
+        std::vector<ck::index_t> i_inStrides;
+        std::vector<ck::index_t> i_outLengths;
+        std::vector<ck::index_t> i_outStrides;
+
+        i_inLengths.assign(inLengths.begin(), inLengths.end());
+        i_inStrides.assign(inStrides.begin(), inStrides.end());
+        i_outLengths.assign(outLengths.begin(), outLengths.end());
+        i_outStrides.assign(outStrides.begin(), outStrides.end());

         for(auto& reduce_ptr : reduce0_ptrs)
         {
-            auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-            DeviceMem ws_dev(wsSizeInBytes);
-
-            InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_0 acc_elementwise_op_0(
-                static_cast<int32_t>(reduce_total_length));
+            InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));

             auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                                 i_inStrides,
@@ -417,11 +351,11 @@ void profile_reduce_impl_impl(bool do_verification,
                                                                 alpha,
                                                                 beta,
                                                                 in_dev.GetDeviceBuffer(),
+                                                                nullptr,
                                                                 out_dev.GetDeviceBuffer(),
                                                                 out_indices_dev.GetDeviceBuffer(),
-                                                                ws_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_0,
-                                                                acc_elementwise_op_0);
+                                                                in_elementwise_op,
+                                                                acc_elementwise_op);

             if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                 continue;
@@ -430,7 +364,8 @@ void profile_reduce_impl_impl(bool do_verification,
             auto invoker_ptr = reduce_ptr->MakeInvokerPointer();

-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

             std::size_t num_bytes =
                 invariant_total_length * reduce_total_length * sizeof(InDataType) +
@@ -438,8 +373,9 @@ void profile_reduce_impl_impl(bool do_verification,
             float gb_per_sec = num_bytes / 1.E6 / avg_time;

-            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
-                      << std::endl;
+            if(time_kernel)
+                std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                          << reduce_name << std::endl;

             if(gb_per_sec > best_gb_per_sec)
             {
@@ -449,22 +385,24 @@ void profile_reduce_impl_impl(bool do_verification,
             if(do_verification)
             {
+                bool single_pass;
+
                 out_dev.FromDevice(out.mData.data());
-                ck::utils::check_err(out.mData, out_ref.mData);
+                single_pass = ck::utils::check_err(out.mData, out_ref.mData);

-                if(NeedIndices)
+                if(OutputIndex)
                 {
                     out_indices_dev.FromDevice(out_indices.mData.data());
-                    ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-                    ;
+                    single_pass =
+                        single_pass &&
+                        ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
                 };

-                if(do_log)
+                if(!single_pass)
                 {
-                    LogRangeAsType<float>(std::cout << "out_host  : ", out_ref.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",") << std::endl;
-                }
+                    std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
+                }
+
+                pass = pass && single_pass;
             };

             if(do_dumpout)
@@ -473,7 +411,7 @@ void profile_reduce_impl_impl(bool do_verification,
                 dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
                 dumpBufferToFile(
                     "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
-                if(NeedIndices)
+                if(OutputIndex)
                 {
                     dumpBufferToFile("dump_indices.bin",
                                      out_indices.mData.data(),
@@ -485,156 +423,34 @@ void profile_reduce_impl_impl(bool do_verification,
             };
         };

-        for(auto& reduce_ptr : reduce1_ptrs)
-        {
-            auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-            DeviceMem ws_dev(wsSizeInBytes);
-
-            InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_1 acc_elementwise_op_1(
-                static_cast<int32_t>(reduce_total_length));
-
-            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                                i_inStrides,
-                                                                i_outLengths,
-                                                                i_outStrides,
-                                                                reduceDims,
-                                                                alpha,
-                                                                beta,
-                                                                in_dev.GetDeviceBuffer(),
-                                                                out_dev.GetDeviceBuffer(),
-                                                                out_indices_dev.GetDeviceBuffer(),
-                                                                ws_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_1,
-                                                                acc_elementwise_op_1);
-
-            if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-                continue;
-
-            std::string reduce_name = reduce_ptr->GetTypeString();
-
-            auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-
-            float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
-
-            std::size_t num_bytes =
-                invariant_total_length * reduce_total_length * sizeof(InDataType) +
-                invariant_total_length * sizeof(OutDataType);
-
-            std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-            std::vector<int> inStrides2{inLengths2[1], 1};
-
-            for(auto& reduce2_ptr : reduce2_ptrs)
-            {
-                InElementwiseOperation_2 in_elementwise_op_2(
-                    static_cast<int32_t>(reduce_total_length));
-                AccElementwiseOperation_2 acc_elementwise_op_2(
-                    static_cast<int32_t>(reduce_total_length));
-
-                auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                                      inStrides2,
-                                                                      i_outLengths,
-                                                                      i_outStrides,
-                                                                      reduceDims,
-                                                                      alpha,
-                                                                      beta,
-                                                                      ws_dev.GetDeviceBuffer(),
-                                                                      out_dev.GetDeviceBuffer(),
-                                                                      out_indices_dev.GetDeviceBuffer(),
-                                                                      ws_dev.GetDeviceBuffer(),
-                                                                      in_elementwise_op_2,
-                                                                      acc_elementwise_op_2);
-
-                if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                    continue;
-
-                std::string reduce2_name = reduce2_ptr->GetTypeString();
-
-                auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-
-                float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
-
-                std::size_t num_bytes_2 =
-                    static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
-
-                float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2);
-
-                std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << gb_per_sec
-                          << " GB/s, " << reduce_name << " => " << reduce2_name << std::endl;
-
-                if(gb_per_sec > best_gb_per_sec)
-                {
-                    best_avg_time   = avg_time + avg_time_2;
-                    best_gb_per_sec = gb_per_sec;
-                }
-
-                if(do_verification)
-                {
-                    out_dev.FromDevice(out.mData.data());
-                    ck::utils::check_err(out.mData, out_ref.mData);
-
-                    if(NeedIndices)
-                    {
-                        out_indices_dev.FromDevice(out_indices.mData.data());
-                        ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-                        ;
-                    };
-
-                    if(do_log)
-                    {
-                        LogRangeAsType<float>(std::cout << "out_host  : ", out_ref.mData, ",")
-                            << std::endl;
-                        LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",")
-                            << std::endl;
-                    }
-                }
-
-                if(do_dumpout)
-                {
-                    dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
-                    dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
-                    dumpBufferToFile(
-                        "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
-                    if(NeedIndices)
-                    {
-                        dumpBufferToFile("dump_indices.bin",
-                                         out_indices.mData.data(),
-                                         out_indices.mDesc.GetElementSize());
-                        dumpBufferToFile("dump_indices_host.bin",
-                                         out_indices_ref.mData.data(),
-                                         out_indices_ref.mDesc.GetElementSize());
-                    };
-                };
-            };
-        };
-
-        std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
-                  << std::endl;
+        if(time_kernel)
+            std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
+                      << std::endl;
     }
     else
    {
         std::cout << "The requested reduction operation is not supported, please check !!!"
                   << std::endl;
     };
+
+    return pass;
 };

 template <typename InDataType, typename AccDataType, typename OutDataType>
-void profile_reduce_impl(bool do_verification,
+bool profile_reduce_impl(bool do_verification,
                          int init_method,
                          bool do_log,
                          bool do_dumpout,
-                         int nrepeat,
+                         bool time_kernel,
                          const std::vector<size_t>& inLengths,
                          const std::vector<int>& reduceDims,
                          ReduceTensorOp ReduceOpId,
-                         NanPropagation NanOpt,
-                         ReduceTensorIndices IndicesOpt,
+                         bool PropagateNan,
+                         bool UseIndex,
                          float alpha,
                          float beta)
 {
     bool matched = false;
+    bool pass    = true;

     using tuple_of_description_instances =
         tensor_operation::device::device_reduce_instance::reduce_description_instances;
@@ -648,29 +464,30 @@ void profile_reduce_impl(bool do_verification,
         using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;

         if(!description_match(
-               descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt))
+               descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
             return;

-        profile_reduce_impl_impl<InDataType,
-                                 AccDataType,
-                                 OutDataType,
-                                 descType::Rank_,
-                                 descType::NumReduceDim_,
-                                 static_cast<ReduceTensorOp>(descType::ReduceOpId_),
-                                 static_cast<NanPropagation>(descType::NanOpt_),
-                                 static_cast<ReduceTensorIndices>(descType::IndicesOpt_)>(
-            do_verification,
-            init_method,
-            do_log,
-            do_dumpout,
-            nrepeat,
-            inLengths,
-            reduceDims,
-            alpha,
-            beta);
+        pass = pass &&
+               profile_reduce_impl_impl<InDataType,
+                                        AccDataType,
+                                        OutDataType,
+                                        descType::Rank_,
+                                        descType::NumReduceDim_,
+                                        static_cast<ReduceTensorOp>(descType::ReduceOpId_),
+                                        static_cast<bool>(descType::PropagateNan_),
+                                        static_cast<bool>(descType::UseIndex_)>(do_verification,
+                                                                                init_method,
+                                                                                do_log,
+                                                                                do_dumpout,
+                                                                                time_kernel,
+                                                                                inLengths,
+                                                                                reduceDims,
+                                                                                alpha,
+                                                                                beta);

         matched = true;
     });

+    return pass;
 };

 } // namespace profiler
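The net effect of the header rewrite above: reduction-instance descriptions drop the `NanPropagation`/`ReduceTensorIndices` enums in favor of plain `bool PropagateNan` / `bool UseIndex` template parameters, while instance selection remains a compile-time description match. A self-contained sketch of the scheme; the enum values and the simplified `description_match` body mirror the diff but are not the real header:

#include <cstdio>
#include <vector>

enum class ReduceTensorOp { ADD = 0, MIN = 2, MAX = 3, AMAX = 4, AVG = 5, NORM2 = 7 };

template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
struct ReduceDescription
{
    static constexpr int Rank_         = Rank;
    static constexpr int NumReduceDim_ = NumReduceDim;
    static constexpr int ReduceOpId_   = ReduceOpId;
    static constexpr int PropagateNan_ = PropagateNan;
    static constexpr int UseIndex_     = UseIndex;
};

// Returns true when a compile-time description matches the runtime request.
template <typename DescriptionType>
bool description_match(const DescriptionType&,
                       int Rank,
                       const std::vector<int>& reduceDims,
                       ReduceTensorOp ReduceOpId,
                       bool PropagateNan,
                       bool UseIndex)
{
    if(DescriptionType::Rank_ != Rank ||
       DescriptionType::ReduceOpId_ != static_cast<int>(ReduceOpId) ||
       DescriptionType::PropagateNan_ != static_cast<int>(PropagateNan) ||
       DescriptionType::UseIndex_ != static_cast<int>(UseIndex))
        return false;
    return DescriptionType::NumReduceDim_ == static_cast<int>(reduceDims.size());
}

int main()
{
    // 4-D tensor, reduce 3 dimensions, ADD, no NaN propagation, no indices.
    ReduceDescription<4, 3, 0, false, false> desc;
    std::vector<int> reduceDims{0, 1, 2};
    bool ok = description_match(desc, 4, reduceDims, ReduceTensorOp::ADD, false, false);
    std::printf("match: %d\n", ok);
    return 0;
}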
profiler/src/profile_batched_gemm.cpp

@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
     printf("    3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
     exit(1);
 }
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

The same one-line substitution is applied at every dispatch call site (hunks at lines 82, 102, 122, 142, 162, 182, 202, 222, 242, 262, 282, 302, 322, 342, 362, and 382), each of the form:

         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
         throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
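The driver-side half of the migration is mechanical, as the hunks above show: usage text, `argv` parsing, and the success return code all change together. A condensed, runnable sketch of the pattern shared by these drivers; the argument positions follow profile_batched_gemm.cpp, and the shortened usage text is illustrative:

#include <cstdio>
#include <string>

int main(int argc, char* argv[])
{
    if(argc < 8)
    {
        std::printf("arg4: verification (0: no; 1: yes)\n");
        std::printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        std::printf("arg6: print tensor value (0: no; 1: yes)\n");
        std::printf("arg7: time kernel (0=n0, 1=yes)\n");
        return 1; // usage error: a non-zero exit is still appropriate here
    }

    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const bool time_kernel     = std::stoi(argv[7]); // was: const int nrepeat = ...

    std::printf("verify=%d init=%d log=%d time=%d\n",
                do_verification, init_method, do_log, time_kernel);
    return 0; // success is now 0 (was 1), so shells and CI interpret these runs correctly
}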
profiler/src/profile_batched_gemm_reduce.cpp

@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
     printf("    3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
     printf("arg15: split k into mulitiple batch\n");
     exit(1);
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

The same substitution is applied at each dispatch call site (hunks at lines 69, 91, 113, and 135):

         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
         throw std::runtime_error("wrong! this data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_conv_bwd_data.cpp
deleted (100644 → 0)

The 2-D-only backward-data driver is removed in its entirety (195 lines). Its contents, with the F16/BF16/INT8 dispatch branches condensed since they differ from the F32 branch only in the data-type template arguments:

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_bwd_data_impl.hpp"

enum struct ConvDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};

enum struct ConvInputLayout
{
    NCHW, // 0
    NHWC, // 1
};

enum struct ConvWeightLayout
{
    KCYX, // 0
    KYXC, // 1
};

enum struct ConvOutputLayout
{
    NKHW, // 0
    NHWK, // 1
};

int profile_conv_bwd_data(int argc, char* argv[])
{
    if(argc != 25)
    {
        printf("arg1: tensor operation (conv_bwd: BackwardConvolution)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
        printf("arg6: verification (0: no; 1: yes)\n");
        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
        printf("arg9: run kernel # of times (>1)\n");
        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    const auto data_type  = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto in_layout  = static_cast<ConvInputLayout>(std::stoi(argv[3]));
    const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
    const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));

    const bool do_verification = std::stoi(argv[6]);
    const int init_method      = std::stoi(argv[7]);
    const bool do_log          = std::stoi(argv[8]);
    const int nrepeat          = std::stoi(argv[9]);

    const ck::index_t N  = std::stoi(argv[10]);
    const ck::index_t K  = std::stoi(argv[11]);
    const ck::index_t C  = std::stoi(argv[12]);
    const ck::index_t Y  = std::stoi(argv[13]);
    const ck::index_t X  = std::stoi(argv[14]);
    const ck::index_t Hi = std::stoi(argv[15]);
    const ck::index_t Wi = std::stoi(argv[16]);

    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);

    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;

    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
       wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
    {
        ck::profiler::profile_conv_bwd_data_impl<2,
                                                 float,
                                                 float,
                                                 float,
                                                 float,
                                                 ck::tensor_layout::convolution::NHWC,
                                                 ck::tensor_layout::convolution::KYXC,
                                                 ck::tensor_layout::convolution::NHWK>(
            do_verification,
            init_method,
            do_log,
            nrepeat,
            N,
            K,
            C,
            std::vector<ck::index_t>{Hi, Wi},
            std::vector<ck::index_t>{Y, X},
            std::vector<ck::index_t>{Ho, Wo},
            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
    }
    else if(data_type == ConvDataType::F16_F16_F16 /* same NHWC/KYXC/NHWK layout checks */)
    {
        // identical call with <2, ck::half_t, ck::half_t, ck::half_t, float, ...>
    }
    else if(data_type == ConvDataType::BF16_BF16_BF16 /* same layout checks */)
    {
        // identical call with <2, uint16_t, uint16_t, uint16_t, float, ...>
    }
    else if(data_type == ConvDataType::INT8_INT8_INT8 /* same layout checks */)
    {
        // identical call with <2, int8_t, int8_t, int8_t, int32_t, ...>
    }
    else
    {
        throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
    }

    return 1;
}
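The deleted driver's only non-boilerplate logic is the output-spatial-size computation, which the convolution profilers that remain still need in one form or another. Restated as a self-contained sketch with illustrative sizes:

#include <cstdio>

int main()
{
    // Illustrative sizes: 3x3 filter, 28x28 input, stride 1, dilation 1, pad 1.
    const int Y = 3, X = 3;
    const int Hi = 28, Wi = 28;
    const int conv_stride_h = 1, conv_stride_w = 1;
    const int conv_dilation_h = 1, conv_dilation_w = 1;
    const int in_left_pad_h = 1, in_left_pad_w = 1;
    const int in_right_pad_h = 1, in_right_pad_w = 1;

    // Effective filter extent after dilation...
    const int YEff = (Y - 1) * conv_dilation_h + 1;
    const int XEff = (X - 1) * conv_dilation_w + 1;

    // ...then the standard output-size formula, exactly as in the deleted file.
    const int Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
    const int Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

    std::printf("Ho = %d, Wo = %d\n", Ho, Wo); // 28, 28 for these sizes
    return 0;
}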
profiler/src/profile_conv_bwd_weight.cpp

@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
     const ck::index_t N = std::stoi(argv[10]);
     const ck::index_t K = std::stoi(argv[11]);

The same substitution is applied at both dispatch call sites (hunks at lines 98 and 124):

         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         N,
         K,
         C,

@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
         throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_conv_fwd_bias_relu.cpp

@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
     printf("arg6: verification (0: no; 1: yes)\n");
     printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
     printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg9: run kernel # of times (>1)\n");
+    printf("arg9: time kernel (0=n0, 1=yes)\n");
     printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
            "RightPx\n");
     exit(1);
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
     const ck::index_t N = std::stoi(argv[10]);
     const ck::index_t K = std::stoi(argv[11]);
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         N,
         K,
         C,
@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
         throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_conv_fwd_bias_relu_add.cpp

@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
     printf("arg6: verification (0: no; 1: yes)\n");
     printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
     printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg9: run kernel # of times (>1)\n");
+    printf("arg9: time kernel (0=n0, 1=yes)\n");
     printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
            "RightPx\n");
     exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
     const ck::index_t N = std::stoi(argv[10]);
     const ck::index_t K = std::stoi(argv[11]);
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         N,
         K,
         C,
@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
         throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp

@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
     printf("arg6: verification (0: no; 1: yes)\n");
     printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
     printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg9: run kernel # of times (>1)\n");
+    printf("arg9: time kernel (0=n0, 1=yes)\n");
     printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
            "RightPx\n");
     exit(1);
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);
     const ck::index_t N = std::stoi(argv[10]);
     const ck::index_t K = std::stoi(argv[11]);
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         N,
         K,
         C,
@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
         throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_convnd_bwd_data.cpp

@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
     printf("arg6: verification (0: no; 1: yes)\n");
     printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
     printf("arg8: print tensor value (0: no; 1: yes)\n");
-    printf("arg9: run kernel # of times (>1)\n");
+    printf("arg9: time kernel (0=n0, 1=yes)\n");
     printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
            "RightPx\n");
     return 1;

@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
     const bool do_verification = std::stoi(argv[6]);
     const int init_method      = std::stoi(argv[7]);
     const bool do_log          = std::stoi(argv[8]);
-    const int nrepeat          = std::stoi(argv[9]);
+    const bool time_kernel     = std::stoi(argv[9]);

     ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);

@@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         params.N_,
         params.K_,
         params.C_,

@@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         params.N_,
         params.K_,
         params.C_,

@@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         params.N_,
         params.K_,
         params.C_,
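The hunks above route the fifteen positional arguments (arg10 to 24) through `parse_conv_params` into a `ck::utils::conv::ConvParams`. That helper's body is outside this diff; a plausible sketch of what such a parser does, with struct and helper names that are illustrative stand-ins only:

#include <string>
#include <vector>

// Illustrative stand-in for ck::utils::conv::ConvParams; the real struct
// lives in the library and is not shown in this commit.
struct ConvParamsSketch
{
    int N_, K_, C_;
    std::vector<int> filter_, input_;          // Y, X / Hi, Wi
    std::vector<int> strides_, dilations_;     // Sy, Sx / Dy, Dx
    std::vector<int> left_pads_, right_pads_;  // LeftPy, LeftPx / RightPy, RightPx
};

// Hypothetical 2-D equivalent of parse_conv_params:
// consumes N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx.
ConvParamsSketch parse_conv_params_sketch(char* argv[], int first)
{
    int i = first;
    auto next = [&]() { return std::stoi(argv[i++]); };

    ConvParamsSketch p;
    p.N_ = next();
    p.K_ = next();
    p.C_ = next();
    p.filter_     = {next(), next()}; // braced lists evaluate left-to-right
    p.input_      = {next(), next()};
    p.strides_    = {next(), next()};
    p.dilations_  = {next(), next()};
    p.left_pads_  = {next(), next()};
    p.right_pads_ = {next(), next()};
    return p;
}

int main()
{
    const char* args[] = {"4", "256", "192", "3", "3", "71", "71", "2", "2",
                          "1", "1", "1", "1", "1", "1"};
    ConvParamsSketch p = parse_conv_params_sketch(const_cast<char**>(args), 0);
    return p.N_ == 4 ? 0 : 1;
}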
profiler/src/profile_convnd_fwd.cpp

@@ -119,7 +119,7 @@ template <int NDim,
 void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
                                    bool do_verification,
                                    bool do_log,
-                                   int nrepeat,
+                                   bool time_kernel,
                                    int init_method,
                                    ConvLayouts)
 {

@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
         reference_conv_fwd_fun);

     auto best_conf = run_engine.Profile(
         conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
-        nrepeat,
+        time_kernel,
         do_verification,
         do_log);

@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
                              const ck::utils::conv::ConvParams& params,
                              bool do_verification,
                              bool do_log,
-                             int nrepeat,
+                             bool time_kernel,
                              int init_method)
 {
     switch(data_layout)

@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;

@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;

@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;

@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
             break;

@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;

@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;

@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;

@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
                 params,
                 do_verification,
                 do_log,
-                nrepeat,
+                time_kernel,
                 init_method,
                 ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
             break;

@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     bool do_verification{true};
     int init_method{2};
     bool do_log{false};
-    int nrepeat{100};
+    bool time_kernel{false};
     int num_dim_spatial{2};

     ConvParams params;

@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
         do_verification = std::stoi(argv[4]);
         init_method     = std::stoi(argv[5]);
         do_log          = std::stoi(argv[6]);
-        nrepeat         = std::stoi(argv[7]);
+        time_kernel     = std::stoi(argv[7]);
         num_dim_spatial = std::stoi(argv[8]);
     }

     if(argc >= 10)

@@ -332,20 +332,20 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
     {
     case 1:
         profile_convnd_instances<1>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 2:
         profile_convnd_instances<2>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     case 3:
         profile_convnd_instances<3>(
-            data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
+            data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
         break;
     default:
         throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
                                  std::to_string(num_dim_spatial));
     }

-    return 1;
+    return 0;
 }
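The switch at the end is the usual trick for turning the runtime `num_dim_spatial` value into the compile-time `NDim` template argument that the instance lists require: each case instantiates the template with a literal. A self-contained sketch of the same pattern (the `run_conv` body is a placeholder, not library code):

#include <cstdio>
#include <stdexcept>
#include <string>

// Placeholder for the NDim-templated work; in the profiler this would
// instantiate ConvolutionFwdInstances<...>::Get<NDim>() and run them.
template <int NDim>
void run_conv()
{
    std::printf("running %d-D convolution instances\n", NDim);
}

// Runtime value -> compile-time template parameter via an explicit switch.
void dispatch(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 1: run_conv<1>(); break;
    case 2: run_conv<2>(); break;
    case 3: run_conv<3>(); break;
    default:
        throw std::runtime_error("unsupported num_dim_spatial value: " +
                                 std::to_string(num_dim_spatial));
    }
}

int main() { dispatch(2); }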
profiler/src/profile_gemm.cpp

@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
     printf("arg14: split k into mulitiple batch\n");
     exit(1);

@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

@@ -68,13 +68,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::half_t,
                                         ck::half_t,
                                         ck::half_t,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -88,13 +89,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::half_t,
                                         ck::half_t,
                                         ck::half_t,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -108,13 +110,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::half_t,
                                         ck::half_t,
                                         ck::half_t,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -128,13 +131,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::half_t,
                                         ck::half_t,
                                         ck::half_t,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         ck::profiler::profile_gemm_impl<float,
                                         float,
                                         float,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,

@@ -154,7 +159,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -166,6 +171,7 @@ int profile_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         ck::profiler::profile_gemm_impl<float,
                                         float,
                                         float,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,

@@ -174,7 +180,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -186,6 +192,7 @@ int profile_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
     {
         ck::profiler::profile_gemm_impl<float,
                                         float,
                                         float,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,

@@ -194,7 +201,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -206,6 +213,7 @@ int profile_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
     {
         ck::profiler::profile_gemm_impl<float,
                                         float,
                                         float,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,

@@ -214,7 +222,7 @@ int profile_gemm(int argc, char* argv[])
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -228,13 +236,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<int8_t,
                                         int8_t,
                                         int8_t,
+                                        int32_t,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -248,13 +257,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<int8_t,
                                         int8_t,
                                         int8_t,
+                                        int32_t,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -268,13 +278,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<int8_t,
                                         int8_t,
                                         int8_t,
+                                        int32_t,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -288,13 +299,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<int8_t,
                                         int8_t,
                                         int8_t,
+                                        int32_t,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -308,13 +320,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                         ck::bhalf_t,
                                         ck::bhalf_t,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -328,13 +341,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                         ck::bhalf_t,
                                         ck::bhalf_t,
+                                        float,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -348,13 +362,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                         ck::bhalf_t,
                                         ck::bhalf_t,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -368,13 +383,14 @@ int profile_gemm(int argc, char* argv[])
         ck::profiler::profile_gemm_impl<ck::bhalf_t,
                                         ck::bhalf_t,
                                         ck::bhalf_t,
+                                        float,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::ColumnMajor,
                                         ck::tensor_layout::gemm::RowMajor>(
             do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             M,
             N,
             K,

@@ -388,5 +404,5 @@ int profile_gemm(int argc, char* argv[])
         throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
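Each `profile_gemm_impl` instantiation gains a fourth type argument (`float` for f16/bf16, `int32_t` for int8): an accumulator type kept separate from the input and output element types. A minimal reference-GEMM sketch of why that distinction matters (plain C++ for illustration, not the library's implementation):

#include <cstdint>
#include <vector>

// Naive row-major GEMM with an explicit accumulator type. Accumulating
// int8 products in int8 would overflow almost immediately; accumulating
// in int32 (or fp16 products in fp32) keeps the partial sums exact until
// the final cast back to the output type.
template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
void reference_gemm(const std::vector<ADataType>& a, // M x K
                    const std::vector<BDataType>& b, // K x N
                    std::vector<CDataType>& c,       // M x N
                    int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            AccDataType acc = 0;
            for(int k = 0; k < K; ++k)
                acc += static_cast<AccDataType>(a[m * K + k]) *
                       static_cast<AccDataType>(b[k * N + n]);
            c[m * N + n] = static_cast<CDataType>(acc);
        }
}

int main()
{
    const int M = 2, N = 2, K = 64;
    std::vector<int8_t> a(M * K, 100), b(K * N, 100), c(M * N);
    // 64 * 100 * 100 = 640000 fits in int32_t but nowhere near int8_t.
    reference_gemm<int8_t, int8_t, int8_t, int32_t>(a, b, c, M, N, K);
}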
profiler/src/profile_gemm_bias_2d.cpp

@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
     printf("arg14: alpha\n");
     printf("arg15: beta\n");

@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
         throw std::runtime_error("wrong! this data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_gemm_bias_relu.cpp

@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
     printf("arg14: split k into mulitiple batch\n");
     exit(1);

@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
         throw std::runtime_error("wrong! this data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_gemm_bias_relu_add.cpp

@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
    printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
     printf("arg15: split k into mulitiple batch\n");
     exit(1);

@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         throw std::runtime_error("wrong! this data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
profiler/src/profile_gemm_reduce.cpp

@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
     printf("arg14: split k into mulitiple batch\n");
     exit(1);

@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const int M = std::stoi(argv[8]);
     const int N = std::stoi(argv[9]);

@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
         do_verification,
         init_method,
         do_log,
-        nrepeat,
+        time_kernel,
         M,
         N,
         K,

@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
         throw std::runtime_error("wrong! this data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
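Every module in this commit also flips its success path from `return 1` to `return 0`, aligning the profiler with the usual process-exit convention (zero = success, non-zero = failure) so shells and CI scripts can test the result. A trivial sketch of that convention (illustrative only, not the profiler's code):

#include <cstdio>

// Zero signals success to the caller; any non-zero value signals failure,
// which is what `./profiler gemm ... && echo ok` or a CI step checks.
int profile_something()
{
    const bool ok = true; // stands in for "all instances ran and verified"
    return ok ? 0 : 1;
}

int main() { return profile_something(); }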
profiler/src/profile_grouped_gemm.cpp

@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
     printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
     printf("arg4: verification (0: no; 1: yes)\n");
     printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
-    printf("arg8: print tensor value (0: no; 1: yes)\n");
+    printf("arg6: print tensor value (0: no; 1: yes)\n");
-    printf("arg7: run kernel # of times (>1)\n");
+    printf("arg7: time kernel (0=n0, 1=yes)\n");
     printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
            "64,64 64,64 128,128)\n");
     exit(1);

@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
-    const int nrepeat          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[7]);
     const auto Ms = argToIntArray(argv[8]);
     const auto Ns = argToIntArray(argv[9]);

@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                 ck::half_t,
                                                 ck::half_t,
+                                                ck::half_t,
                                                 ck::tensor_layout::gemm::RowMajor,

@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             Ms,
             Ns,
             Ks,

@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                 ck::half_t,
                                                 ck::half_t,
+                                                ck::half_t,
                                                 ck::tensor_layout::gemm::RowMajor,

@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             Ms,
             Ns,
             Ks,

@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
         ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                 ck::half_t,
                                                 ck::half_t,
+                                                ck::half_t,
                                                 ck::tensor_layout::gemm::ColumnMajor,

@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             Ms,
             Ns,
             Ks,

@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
         ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                 ck::half_t,
                                                 ck::half_t,
+                                                ck::half_t,
                                                 ck::tensor_layout::gemm::ColumnMajor,

@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
                                                 ck::tensor_layout::gemm::RowMajor>(do_verification,
             init_method,
             do_log,
-            nrepeat,
+            time_kernel,
             Ms,
             Ns,
             Ks,

@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
         throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
     }

-    return 1;
+    return 0;
 }
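Grouped GEMM takes each size argument as a comma-separated list, one entry per group, parsed by `argToIntArray`. That helper's body lies outside this diff; a plausible equivalent, with the name reused purely for illustration:

#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for the profiler's argToIntArray:
// splits "256,256,128" into {256, 256, 128}.
std::vector<int> argToIntArray(const char* arg)
{
    std::vector<int> values;
    std::istringstream iss(arg);
    std::string token;
    while(std::getline(iss, token, ','))
        values.push_back(std::stoi(token));
    return values;
}

int main()
{
    const auto Ms = argToIntArray("256,256");
    const auto Ns = argToIntArray("128,128");
    return (Ms.size() == Ns.size()) ? 0 : 1; // one M/N/K triple per group
}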
profiler/src/profile_reduce.cpp

 #include <iostream>
 #include <fstream>
-#include <numeric>
-#include <initializer_list>
 #include <cstdlib>
 #include <vector>
 #include <stdexcept>
 #include <sstream>
 #include <getopt.h>

-#include "config.hpp"
+#include "data_type_enum.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
 #include "reduction_enums.hpp"
+#include "host_common_util.hpp"
 #include "profile_reduce_impl.hpp"

 using namespace std;
-using ck::NanPropagation;
-using ck::ReduceTensorIndices;
 using ck::ReduceTensorOp;

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},

@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
     {"bf16", no_argument, nullptr, '?'},
     {"dumpout", required_argument, nullptr, 'o'},
     {"verify", required_argument, nullptr, 'v'},
-    {"log", required_argument, nullptr, 'l'},
     {"help", no_argument, nullptr, '?'},
     {nullptr, 0, nullptr, 0}};

-template <typename T>
-static T getSingleValueFromString(const string& valueStr)
-{
-    std::istringstream iss(valueStr);
-    T val;
-    iss >> val;
-    return (val);
-};
-
-template <typename T>
-static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-{
-    std::string valuesStr(cstr_values);
-    std::vector<T> values;
-    std::size_t pos = 0;
-    std::size_t new_pos;
-
-    new_pos = valuesStr.find(',', pos);
-    while(new_pos != std::string::npos)
-    {
-        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-        T val = getSingleValueFromString<T>(sliceStr);
-        values.push_back(val);
-        pos     = new_pos + 1;
-        new_pos = valuesStr.find(',', pos);
-    };
-
-    std::string sliceStr = valuesStr.substr(pos);
-    T val = getSingleValueFromString<T>(sliceStr);
-    values.push_back(val);
-
-    return (values);
-}
-
-enum struct AppDataType
-{
-    appHalf     = 0,
-    appFloat    = 1,
-    appInt32    = 2,
-    appInt8     = 3,
-    appInt8x4   = 4,
-    appBFloat16 = 5,
-    appDouble   = 6,
-};
-
 static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
     for(auto dim : reduceDims)

@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
     };
 };

-class AppArgs
+class ReduceProfilerArgs
 {
     private:
     int option_index = 0;
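With the local `AppDataType` enum gone in favor of the library-wide `ck::DataTypeEnum`, the integers a user passes via `-C`/`-W` now map onto the shared enum via `static_cast`. A sketch of that CLI-integer-to-scoped-enum pattern (the enumerator names and values below are illustrative, not the library's actual numbering from data_type_enum.hpp):

#include <cstdlib>
#include <stdexcept>

// Illustrative scoped enum standing in for ck::DataTypeEnum.
enum struct DataTypeEnumSketch : int
{
    Half  = 0,
    Float = 1,
    Int32 = 2,
    Int8  = 3,
};

// CLI option values arrive as text; static_cast turns the parsed int into
// the scoped enum, so "-C 1" selects Float. Values outside the enumerator
// list survive the cast unchecked, so callers typically validate afterwards.
DataTypeEnumSketch parse_type_option(const char* optarg_value)
{
    if(optarg_value == nullptr)
        throw std::runtime_error("Invalid option format!");
    return static_cast<DataTypeEnumSketch>(std::atoi(optarg_value));
}

int main() { return parse_type_option("1") == DataTypeEnumSketch::Float ? 0 : 1; }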
@@ -130,26 +68,23 @@ class AppArgs
     std::vector<float> scales;
     ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
-    AppDataType compTypeId = AppDataType::appFloat;
+    ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
-    AppDataType outTypeId = AppDataType::appFloat;
+    ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
     bool compType_assigned = false;
     bool outType_assigned  = false;
-    NanPropagation nanOpt          = NanPropagation::NOT_PROPAGATE_NAN;
+    int nanOpt = 0;
-    ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
+    int indicesOpt = 0;
-    bool do_log          = false;
     bool do_verification = false;
     bool do_dumpout      = false;
     int init_method;
-    int nrepeat;
+    bool time_kernel;
-    bool need_indices = false;

-    AppArgs()  = default;
-    ~AppArgs() = default;
+    ReduceProfilerArgs()  = default;
+    ~ReduceProfilerArgs() = default;

     void show_usage(const char* cmd)
     {

@@ -166,8 +101,11 @@ class AppArgs
         std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
                      "output, which could be float when the input data is half"
                   << std::endl;
-        std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl;
-        std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
-                  << std::endl;
+        std::cout
+            << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
+            << std::endl;
+        std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
+                     "index in reduction"
+                  << std::endl;
         std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                   << std::endl;

@@ -181,18 +119,19 @@ class AppArgs
         std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
                      "for further analysis"
                   << std::endl;
-        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
     };

     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;

         optind++; // to skip the "reduce" module name
         while(1)
         {
-            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
             if(ch == -1)
                 break;

             switch(ch)
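`processArgs` drives everything through `getopt_long`, pairing the `long_options` table with a short-option string in which a trailing colon marks an option that takes an argument (dropping `l:` here is what retires the `--log`/`-l` option). A compact, self-contained sketch of the same parsing loop, with a reduced option set for illustration:

#include <cstdio>
#include <cstdlib>
#include <getopt.h>

int main(int argc, char* argv[])
{
    static struct option long_opts[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"verify", required_argument, nullptr, 'v'},
                                        {nullptr, 0, nullptr, 0}};

    // "D:v:" -- each letter followed by ':' expects its argument in optarg.
    int ch;
    while((ch = getopt_long(argc, argv, "D:v:", long_opts, nullptr)) != -1)
    {
        switch(ch)
        {
        case 'D': std::printf("inLengths = %s\n", optarg); break;
        case 'v': std::printf("verify    = %d\n", std::atoi(optarg)); break;
        default: std::printf("unknown option\n"); return 1;
        }
    }
    return 0;
}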
@@ -219,27 +158,27 @@ class AppArgs
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                compTypeId = static_cast<AppDataType>(std::atoi(optarg));
+                compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 compType_assigned = true;
                 break;
             case 'W':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                outTypeId = static_cast<AppDataType>(std::atoi(optarg));
+                outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                 outType_assigned = true;
                 break;
             case 'N':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
+                nanOpt = std::atoi(optarg);
                 break;
             case 'I':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");

-                indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
+                indicesOpt = std::atoi(optarg);
                 break;
             case 'S':
                 if(!optarg)

@@ -262,12 +201,6 @@ class AppArgs
                 do_dumpout = static_cast<bool>(std::atoi(optarg));
                 break;

-            case 'l':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                do_log = static_cast<bool>(std::atoi(optarg));
-                break;
             case '?':
                 if(std::string(long_options[option_index].name) == "half")
                     use_half = true;

@@ -295,7 +228,7 @@ class AppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

         if(scales.empty())
         {

@@ -306,9 +239,6 @@ class AppArgs
         if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
            reduceOp == ReduceTensorOp::AMAX)
        {
-            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
-                need_indices = true;
-
             // for indexable operations, no need to assign compType and outType, just let them be
             // same as inType
             compType_assigned = false;
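Only the ordering reductions (MIN, MAX, AMAX) can meaningfully return element indices, which is why the class special-cases them and why `indicesOpt` is now carried as a plain int and only narrowed to a bool at the call site. A small sketch of that gating logic (enumerator set abbreviated; this is not the library's code):

// Illustrative subset of ck::ReduceTensorOp.
enum struct ReduceOpSketch
{
    ADD,
    MUL,
    MIN,
    MAX,
    AMAX,
};

// Index output is only defined for ordering reductions; for ADD/MUL the
// request is ignored, mirroring the profiler's MIN/MAX/AMAX check.
bool use_index(ReduceOpSketch op, int indicesOpt)
{
    const bool indexable = (op == ReduceOpSketch::MIN || op == ReduceOpSketch::MAX ||
                            op == ReduceOpSketch::AMAX);
    return indexable && static_cast<bool>(indicesOpt);
}

int main() { return use_index(ReduceOpSketch::MAX, 1) ? 0 : 1; }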
...
@@ -322,9 +252,10 @@ class AppArgs
...
@@ -322,9 +252,10 @@ class AppArgs
int
profile_reduce
(
int
argc
,
char
*
argv
[])
int
profile_reduce
(
int
argc
,
char
*
argv
[])
{
{
using
namespace
ck
::
profiler
;
using
ck
::
DataTypeEnum
;
using
ck
::
profiler
::
profile_reduce_impl
;
App
Args
args
;
ReduceProfiler
Args
args
;
if
(
args
.
processArgs
(
argc
,
argv
)
<
0
)
if
(
args
.
processArgs
(
argc
,
argv
)
<
0
)
return
(
-
1
);
return
(
-
1
);
...
@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
     if(args.use_half)
     {
         if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appHalf;
+            args.compTypeId = DataTypeEnum::Half;

         if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

         if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appHalf;
+            args.outTypeId = DataTypeEnum::Half;

-        if(args.compTypeId == AppDataType::appHalf)
+        if(args.compTypeId == DataTypeEnum::Half)
         {
-            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
-        else if(args.compTypeId == AppDataType::appFloat)
+        else if(args.compTypeId == DataTypeEnum::Float)
         {
-            profile_reduce_impl<ck::half_t, float, ck::half_t>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<ck::half_t, float, ck::half_t>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
...
@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
     {
-        profile_reduce_impl<double, double, double>(
-            args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-            args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-            args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+        profile_reduce_impl<double, double, double>(
+            args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+            args.inLengths, args.reduceDims, args.reduceOp,
+            static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+            args.scales[0], args.scales[1]);
     }
     else if(args.use_int8)
     {
         if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appInt8;
+            args.compTypeId = DataTypeEnum::Int8;

         if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
-            args.outTypeId = AppDataType::appInt32;
+           (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
+            args.outTypeId = DataTypeEnum::Int32;

         if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appInt8;
+            args.outTypeId = DataTypeEnum::Int8;

-        if(args.compTypeId == AppDataType::appInt8)
+        if(args.compTypeId == DataTypeEnum::Int8)
         {
-            profile_reduce_impl<int8_t, int8_t, int8_t>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<int8_t, int8_t, int8_t>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
-        else if(args.compTypeId == AppDataType::appInt32)
+        else if(args.compTypeId == DataTypeEnum::Int32)
         {
-            profile_reduce_impl<int8_t, int32_t, int8_t>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<int8_t, int32_t, int8_t>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
...
@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
     else if(args.use_bf16)
     {
         if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
-            args.outTypeId = AppDataType::appFloat;
+           (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
+            args.outTypeId = DataTypeEnum::Float;

         if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appBFloat16;
+            args.outTypeId = DataTypeEnum::BFloat16;

-        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(
-            args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-            args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-            args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(
+            args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+            args.inLengths, args.reduceDims, args.reduceOp,
+            static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+            args.scales[0], args.scales[1]);
     }
     else
     {
-        if(args.compTypeId == AppDataType::appFloat)
+        if(args.compTypeId == DataTypeEnum::Float)
         {
-            profile_reduce_impl<float, float, float>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<float, float, float>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
-        else if(args.compTypeId == AppDataType::appDouble)
+        else if(args.compTypeId == DataTypeEnum::Double)
         {
-            profile_reduce_impl<float, double, float>(
-                args.do_verification, args.init_method, args.do_log, args.do_dumpout,
-                args.nrepeat, args.inLengths, args.reduceDims, args.reduceOp,
-                args.nanOpt, args.indicesOpt, args.scales[0], args.scales[1]);
+            profile_reduce_impl<float, double, float>(
+                args.do_verification, args.init_method, args.do_dumpout, args.time_kernel,
+                args.inLengths, args.reduceDims, args.reduceOp,
+                static_cast<bool>(args.nanOpt), static_cast<bool>(args.indicesOpt),
+                args.scales[0], args.scales[1]);
         }
...
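Taken together, the hunks above swap the profiler-local AppDataType enum for ck::DataTypeEnum and thread the new time_kernel flag through every instantiation. The if/else chain is in effect a lookup from the requested input and compute types to a <InDataType, AccDataType, OutDataType> triple. A compact way to see the whole table, as an illustrative Python sketch of the mapping in the C++ above (not code from this commit; the flag spellings are assumptions based on the --half option and the use_int8/use_bf16 members):

# Illustrative only: each entry is the <InDataType, AccDataType, OutDataType>
# triple that profile_reduce_impl is instantiated with above.
REDUCE_DISPATCH = {
    ('half',   'Half'):   ('ck::half_t',  'ck::half_t', 'ck::half_t'),
    ('half',   'Float'):  ('ck::half_t',  'float',      'ck::half_t'),
    ('double', 'Double'): ('double',      'double',     'double'),
    ('int8',   'Int8'):   ('int8_t',      'int8_t',     'int8_t'),
    ('int8',   'Int32'):  ('int8_t',      'int32_t',    'int8_t'),
    ('bf16',   'Float'):  ('ck::bhalf_t', 'float',      'ck::bhalf_t'),
    ('float',  'Float'):  ('float',       'float',      'float'),
    ('float',  'Double'): ('float',       'double',     'float'),
}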
profiler/src/profiler.cpp
View file @ 68886f7d
...
@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
 int profile_gemm_reduce(int, char*[]);
 int profile_batched_gemm(int, char*[]);
 int profile_grouped_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
...
@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
     }
     else if(strcmp(argv[1], "grouped_gemm") == 0)
     {
-        profile_grouped_gemm(argc, argv);
+        return profile_grouped_gemm(argc, argv);
     }
     else if(strcmp(argv[1], "conv_fwd") == 0)
     {
...
@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
            " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
            " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
            " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
-           " reduce: REDUCE\n"
+           " reduce: Reduce\n"
            " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
         // clang-format on
     }
...
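One behavioral fix in this file: main() previously discarded the result of profile_grouped_gemm and fell through; it now returns it, so the process exit status reflects the sub-profiler's outcome. That is what lets a scripted run gate on the profiler. A minimal sketch of such a wrapper (the binary path and argument list are placeholders, not taken from this page):

import subprocess, sys

# Placeholder invocation: real ckProfiler arguments depend on the op
# being profiled; the exit-status handling is the point here.
result = subprocess.run(["./ckProfiler", "grouped_gemm"] + sys.argv[1:])
sys.exit(result.returncode)  # non-zero propagates a profiler failure to CI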
script/parse_perf_data.py
View file @ 68886f7d
 #!/usr/bin/env python3
-import os, io
-import argparse
+import os, io, argparse, datetime, re
+import numpy as np
+import sqlalchemy
+from sqlalchemy.types import NVARCHAR, Float, Integer
+import pymysql
+import pandas as pd
+from sshtunnel import SSHTunnelForwarder

 def print_to_string(*args, **kwargs):
     output = io.StringIO()
     print(*args, file=output, **kwargs)
     contents = output.getvalue()
     output.close()
     return contents

 def parse_args():
     parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
     parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
     args = parser.parse_args()
     files = []
     if os.path.isdir(args.filename):
         all_files = os.listdir(args.filename)
         for name in all_files:
             if not 'log' in name:
                 continue
             files.append(os.path.join(args.filename, name))
     else:
         files = [args.filename]
     args.files = files
     return args
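A note on parse_args: the positional argument is either a single log file or a directory, and only names containing 'log' are kept. A minimal equivalent of that discovery step using glob (a hypothetical helper, assuming the same '*log*' naming convention):

import glob, os

def find_logs(path):
    # Hypothetical helper mirroring parse_args' file discovery:
    # a directory yields every entry whose name contains 'log'.
    if os.path.isdir(path):
        return sorted(glob.glob(os.path.join(path, '*log*')))
    return [path]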
 def main():
     args = parse_args()
-    results = []
-    #parse results
-    glue = ""
-    for filename in args.files:
-        for line in open(filename):
-            if 'Best Perf' in line:
-                lst = line.split()
-                results.append(print_to_string(glue.join(lst[8:]), lst[4]))
-    #sort results
-    #read baseline results for the latest develop branch
-    #write new results to the db
-    #compare the results to the baseline
-    #return 0 if performance criteria met, otherwise return 1
-    print(results)
-    return 0
+    tests = []
+    kernels = []
+    tflops = []
+    dtype = []
+    alayout = []
+    blayout = []
+    M = []
+    N = []
+    K = []
+    StrideA = []
+    StrideB = []
+    StrideC = []
+    #parse results, get the Tflops value for "Best Perf" kernels
+    glue = ""
+    for filename in args.files:
+        for line in open(filename):
+            if 'Branch name' in line:
+                lst = line.split()
+                branch_name = lst[2]
+            if 'Node name' in line:
+                lst = line.split()
+                node_id = lst[2]
+            if 'GPU_arch' in line:
+                lst = line.split()
+                gpu_arch = lst[1]
+            if 'HIP version' in line:
+                lst = line.split()
+                hip_vers = lst[2]
+            if 'InstalledDir' in line:
+                lst = line.split()
+                rocm_vers = lst[1][lst[1].find('/opt/rocm-') + len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
+    print("Branch name:", branch_name)
+    print("Node name:", node_id)
+    print("GPU_arch:", gpu_arch)
+    print("ROCM_version:", rocm_vers)
+    print("HIP_version:", hip_vers)
+
+    #parse gemm performance tests:
+    if 'gemm' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst = line.split()
+                    if len(lst) >= 37: #the line is complete
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append(glue.join(lst[37:]))
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                    elif len(lst) < 37 and len(lst) >= 33: #the tflops are available
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append("N/A")
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                        print("warning: incomplete line:", lst)
+                    elif len(lst) < 33: #even the tflops are not available
+                        print("Error in ckProfiler output!")
+                        print("warning: incomplete line=", lst)
+        #sort results
+        #sorted_tests = sorted(tests)
+        #print("sorted tests:",sorted_tests)
+        sorted_tflops = [x for _, x in sorted(zip(tests, tflops))]
+        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
+        test_list = list(range(1, len(tests) + 1))
+
+    #parse resnet50 performance tests:
+    if 'resnet50' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst = line.split()
+                    tflops.append(lst[4])
+    print("Number of tests:", len(tflops))
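The 'Best Perf' parser above relies on fixed token offsets (lst[33] is the TFLOPS figure, lst[14] is M, and so on), so an incomplete line can only be detected by its token count. A helper that names those offsets makes the contract easier to audit; a sketch under the same layout assumptions (the function and its keys are illustrative, not part of the script):

def parse_best_perf(line):
    # Sketch only: uses the same fixed ckProfiler token positions as the
    # loop above. Requires at least 34 tokens so that lst[33] exists.
    lst = line.split()
    if len(lst) < 34:
        return None
    return {
        'dtype':   lst[5],
        'alayout': lst[8],
        'blayout': lst[11],
        'M': lst[14], 'N': lst[17], 'K': lst[20],
        'StrideA': lst[23], 'StrideB': lst[26], 'StrideC': lst[29],
        'tflops':  float(lst[33]),
        'kernel':  ' '.join(lst[37:]) if len(lst) > 37 else 'N/A',
    }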
+    sql_hostname = '127.0.0.1'
+    sql_username = os.environ["dbuser"]
+    sql_password = os.environ["dbpassword"]
+    sql_main_database = 'miopen_perf'
+    sql_port = 3306
+    ssh_host = os.environ["dbsship"]
+    ssh_user = os.environ["dbsshuser"]
+    ssh_port = int(os.environ["dbsshport"])
+    ssh_pass = os.environ["dbsshpassword"]
+
+    with SSHTunnelForwarder((ssh_host, ssh_port),
+                            ssh_username=ssh_user,
+                            ssh_password=ssh_pass,
+                            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
+        sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.format(
+            sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
+        conn = sqlEngine.connect()
+
+        #save gemm performance tests:
+        if 'gemm' in filename:
+            #write the ck_gemm_test_params table
+            #only needed once the test set changes
+            '''
+            sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
+            sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
+            sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
+            sorted_M = [x for _,x in sorted(zip(tests,M))]
+            sorted_N = [x for _,x in sorted(zip(tests,N))]
+            sorted_K = [x for _,x in sorted(zip(tests,K))]
+            sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
+            sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
+            sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
+            ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
+                sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
+                sorted_StrideC]
+            df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
+                'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
+            print(df)
+            dtypes = {
+                'Test_number': Integer(),
+                'Data_type': NVARCHAR(length=5),
+                'Alayout': NVARCHAR(length=12),
+                'Blayout': NVARCHAR(length=12),
+                'M': Integer(),
+                'N': Integer(),
+                'K': Integer(),
+                'StrideA': Integer(),
+                'StrideB': Integer(),
+                'StrideC': Integer()
+            }
+            df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
+            '''
+            #read baseline results for the latest develop branch
+            query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
+            tflops_base = pd.read_sql_query(query, conn)
+            #write new results to the db
+            testlist = []
+            for i in range(1, len(tests) + 1):
+                testlist.append("Test%i" % i)
+            ck_gemm_tflops = [str(branch_name), str(node_id), str(gpu_arch),
+                              str(rocm_vers), str(hip_vers), str(datetime.datetime.now())]
+            flops = pd.DataFrame(data=[ck_gemm_tflops],
+                                 columns=['Branch_ID', 'Node_ID', 'GPU_arch',
+                                          'ROCM_version', 'HIP_version', 'Datetime'])
+            df_add = pd.DataFrame(data=[sorted_tflops], columns=testlist)
+            flops = pd.concat([flops, df_add], axis=1)
+            print("new tflops for gemm tests:", flops)
+            flops.to_sql("ck_gemm_tflops", conn, if_exists='append', index=False)
+
+        #save resnet50 performance tests:
+        if 'resnet50' in filename:
+            #read baseline results for the latest develop branch
+            query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
+            tflops_base_N256 = pd.read_sql_query(query, conn)
+            query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
+            tflops_base_N4 = pd.read_sql_query(query, conn)
+            #write new results to the db
+            testlist = []
+            for i in range(1, 50):
+                testlist.append("Layer%i" % i)
+            ck_resnet_tflops = [str(branch_name), str(node_id), str(gpu_arch),
+                                str(rocm_vers), str(hip_vers), str(datetime.datetime.now())]
+            flops0 = pd.DataFrame(data=[ck_resnet_tflops],
+                                  columns=['Branch_ID', 'Node_ID', 'GPU_arch',
+                                           'ROCM_version', 'HIP_version', 'Datetime'])
+            df_add = pd.DataFrame(data=[tflops[0:49]], columns=testlist)
+            flops = pd.concat([flops0, df_add], axis=1)
+            print("new tflops for N=256 resnet50 test:", flops)
+            flops.to_sql("ck_resnet50_N256_tflops", conn, if_exists='append', index=False)
+            df_add = pd.DataFrame(data=[tflops[49:98]], columns=testlist)
+            flops = pd.concat([flops0, df_add], axis=1)
+            print("new tflops for N=4 resnet50 test:", flops)
+            flops.to_sql("ck_resnet50_N4_tflops", conn, if_exists='append', index=False)
+        conn.close()
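The triple-quoted block above is kept as a template: run once with if_exists='replace' and an explicit dtype map, it recreates the ck_gemm_test_params table with typed columns, while per-run results are appended to ck_gemm_tflops with if_exists='append'. A self-contained sketch of that to_sql pattern against SQLite (hypothetical table and column names; the script itself targets MySQL through the SSH tunnel):

import pandas as pd
import sqlalchemy
from sqlalchemy.types import Integer, NVARCHAR

engine = pd_engine = sqlalchemy.create_engine('sqlite:///:memory:')
df = pd.DataFrame({'Test_number': [1, 2], 'Data_type': ['f16', 'f32']})
# 'replace' recreates the table with the declared column types;
# per-run result rows would then use if_exists='append' as in the script.
df.to_sql('example_params', engine, if_exists='replace', index=False,
          dtype={'Test_number': Integer(), 'Data_type': NVARCHAR(length=5)})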
+    #compare the results to the baseline if baseline exists
+    regression = 0
+    if 'gemm' in filename:
+        if not tflops_base.empty:
+            base = tflops_base[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01 * float(sorted_tflops[i]):
+                    print("test # ", i, "shows regression by {:.3f}%".format(
+                        (float(sorted_tflops[i]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(sorted_tflops[i]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline")
+    if 'resnet50' in filename:
+        if not tflops_base_N256.empty:
+            base = tflops_base_N256[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01 * float(tflops[i]):
+                    print("layer # ", i, "shows regression by {:.3f}%".format(
+                        (float(tflops[i]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(tflops[i]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline for N=256")
+        if not tflops_base_N4.empty:
+            base = tflops_base_N4[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01 * float(tflops[i + 49]):
+                    print("layer # ", i, "shows regression by {:.3f}%".format(
+                        (float(tflops[i + 49]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(tflops[i + 49]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline for N=4")
+    #return 0 if performance criteria met, otherwise return 1
+    return regression

 if __name__ == '__main__':
     main()
\ No newline at end of file
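The success criterion above fails a test whenever the develop baseline is more than 1% faster than the new result, and the reported percentage is the signed change relative to the baseline. A worked example with hypothetical numbers:

# Hypothetical numbers, illustrating the criterion used above:
baseline = 10.0  # TFLOPS from the latest 'develop' run
current = 9.5    # TFLOPS from this run

if baseline > 1.01 * current:  # 10.0 > 9.595, so this counts as a regression
    change = (current - baseline) / baseline * 100
    print("regression by {:.3f}%".format(change))  # prints -5.000%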