Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a3b4c5cb
Commit
a3b4c5cb
authored
Jun 03, 2022
by
wangshaojie6
Browse files
merge develop branch and add gridwise pipeline v3
parents
48918ab9
1677cf70
Changes
361
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
491 additions
and
904 deletions
+491
-904
profiler/include/profile_gemm_reduce_impl.hpp
profiler/include/profile_gemm_reduce_impl.hpp
+62
-45
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+16
-13
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+136
-319
profiler/src/profile_batched_gemm.cpp
profiler/src/profile_batched_gemm.cpp
+20
-20
profiler/src/profile_batched_gemm_reduce.cpp
profiler/src/profile_batched_gemm_reduce.cpp
+8
-8
profiler/src/profile_conv_bwd_data.cpp
profiler/src/profile_conv_bwd_data.cpp
+0
-195
profiler/src/profile_conv_bwd_weight.cpp
profiler/src/profile_conv_bwd_weight.cpp
+4
-4
profiler/src/profile_conv_fwd_bias_relu.cpp
profiler/src/profile_conv_fwd_bias_relu.cpp
+4
-4
profiler/src/profile_conv_fwd_bias_relu_add.cpp
profiler/src/profile_conv_fwd_bias_relu_add.cpp
+4
-4
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+4
-4
profiler/src/profile_convnd_bwd_data.cpp
profiler/src/profile_convnd_bwd_data.cpp
+48
-48
profiler/src/profile_convnd_fwd.cpp
profiler/src/profile_convnd_fwd.cpp
+18
-18
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+36
-20
profiler/src/profile_gemm_bias_2d.cpp
profiler/src/profile_gemm_bias_2d.cpp
+12
-12
profiler/src/profile_gemm_bias_relu.cpp
profiler/src/profile_gemm_bias_relu.cpp
+8
-8
profiler/src/profile_gemm_bias_relu_add.cpp
profiler/src/profile_gemm_bias_relu_add.cpp
+8
-8
profiler/src/profile_gemm_reduce.cpp
profiler/src/profile_gemm_reduce.cpp
+8
-8
profiler/src/profile_grouped_gemm.cpp
profiler/src/profile_grouped_gemm.cpp
+12
-8
profiler/src/profile_reduce.cpp
profiler/src/profile_reduce.cpp
+80
-156
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+3
-2
No files found.
profiler/include/profile_gemm_reduce_impl.hpp
View file @
a3b4c5cb
...
...
@@ -7,7 +7,7 @@
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "
element_wise_
reduc
e
_operat
ion
.hpp"
#include "reduc
tion
_operat
or
.hpp"
#include "device_gemm_reduce.hpp"
#include "reference_gemm.hpp"
...
...
@@ -16,12 +16,22 @@ namespace tensor_operation {
namespace
device
{
namespace
device_gemm_instance
{
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
DPtrsGlobal
=
ck
::
Tuple
<
F32
*
,
F32
*>
;
using
Div
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
F32
,
F32
,
true
>
;
using
Identity
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
F32
,
F32
,
false
>
;
using
Square
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
F32
,
F32
,
false
>
;
using
DInElementOps
=
ck
::
Tuple
<
Identity
,
Square
>
;
using
DOutElementOps
=
ck
::
Tuple
<
Div
,
Div
>
;
using
DeviceGemmReduceNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceGemmReducePtr
<
DPtrsGlobal
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
ReduceSum
,
ck
::
tensor_operation
::
element_wise
::
ReduceSquareSum
>
;
DInElementOps
,
DOutElementOps
>
;
void
add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmReduceNoOpPtr
>&
);
...
...
@@ -53,7 +63,7 @@ template <typename ADataType,
bool
profile_gemm_reduce_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
int
nrepeat
,
bool
time_kernel
,
int
M
,
int
N
,
int
K
,
...
...
@@ -113,11 +123,18 @@ bool profile_gemm_reduce_impl(int do_verification,
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
},
num_thread
);
}
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSum
;
using
D1ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSquareSum
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
UnaryDivElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
float
,
float
,
true
>
;
using
UnaryIdenticElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnaryIdentic
<
float
,
float
,
false
>
;
using
UnarySquareElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
float
,
float
,
false
>
;
using
DxsInElementOps
=
ck
::
Tuple
<
UnaryIdenticElementOp
,
UnarySquareElementOp
>
;
using
DxsOutElementOps
=
ck
::
Tuple
<
UnaryDivElementOp
,
UnaryDivElementOp
>
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
...
...
@@ -125,10 +142,18 @@ bool profile_gemm_reduce_impl(int do_verification,
const
auto
d0_reduce_op
=
D0ReduceOp
{};
const
auto
d1_reduce_op
=
D1ReduceOp
{};
auto
dxs_in_element_op
=
DxsInElementOps
{};
auto
dxs_out_element_op
=
DxsOutElementOps
{
M
,
M
};
if
(
do_verification
)
{
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
DDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
...
@@ -140,17 +165,25 @@ bool profile_gemm_reduce_impl(int do_verification,
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
float
d0_acc
=
d0_reduce_op
.
Get
ReduceZero
Value
();
float
d1_acc
=
d1_reduce_op
.
Get
ReduceZero
Value
();
float
d0_acc
=
d0_reduce_op
.
Get
Identity
Value
();
float
d1_acc
=
d1_reduce_op
.
Get
Identity
Value
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
d0_reduce_op
.
Reduce
(
d0_acc
,
c_m_n_host_result
(
m
,
n
));
d1_reduce_op
.
Reduce
(
d1_acc
,
c_m_n_host_result
(
m
,
n
));
float
c_val
=
ck
::
type_convert
<
float
>
(
c_m_n_host_result
(
m
,
n
));
float
d0_val
=
0
;
float
d1_val
=
0
;
dxs_in_element_op
(
ck
::
Number
<
0
>
{})(
d0_val
,
c_val
);
dxs_in_element_op
(
ck
::
Number
<
1
>
{})(
d1_val
,
c_val
);
d0_reduce_op
(
d0_acc
,
d0_val
);
d1_reduce_op
(
d1_acc
,
d1_val
);
}
d0_m_host_result
(
m
)
=
d0_acc
;
d1_m_host_result
(
m
)
=
d1_acc
;
dxs_out_element_op
(
ck
::
Number
<
0
>
{})(
d0_acc
,
d0_acc
);
dxs_out_element_op
(
ck
::
Number
<
1
>
{})(
d1_acc
,
d1_acc
);
d0_m_host_result
(
m
)
=
ck
::
type_convert
<
DDataType
>
(
d0_acc
);
d1_m_host_result
(
m
)
=
ck
::
type_convert
<
DDataType
>
(
d1_acc
);
}
}
...
...
@@ -160,6 +193,9 @@ bool profile_gemm_reduce_impl(int do_verification,
DeviceMem
d0_device_buf
(
sizeof
(
DDataType
)
*
d0_m_device_result
.
mDesc
.
GetElementSpace
());
DeviceMem
d1_device_buf
(
sizeof
(
DDataType
)
*
d1_m_device_result
.
mDesc
.
GetElementSpace
());
auto
dxs_global
=
ck
::
make_tuple
(
static_cast
<
DDataType
*>
(
d0_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d1_device_buf
.
GetDeviceBuffer
()));
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
...
...
@@ -221,8 +257,7 @@ bool profile_gemm_reduce_impl(int do_verification,
gemm_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d0_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DDataType
*>
(
d1_device_buf
.
GetDeviceBuffer
()),
dxs_global
,
M
,
N
,
K
,
...
...
@@ -232,43 +267,25 @@ bool profile_gemm_reduce_impl(int do_verification,
a_element_op
,
b_element_op
,
c_element_op
,
d
0_reduce
_op
,
d
1_reduce
_op
);
d
xs_in_element
_op
,
d
xs_out_element
_op
);
auto
invoker_ptr
=
gemm_ptr
->
MakeInvokerPointer
();
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
// warm up
invoker_ptr
->
Run
(
argument_ptr
.
get
());
// timing
float
total_time
=
0
;
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
KernelTimer
timer
;
timer
.
Start
();
invoker_ptr
->
Run
(
argument_ptr
.
get
());
timer
.
End
();
total_time
+=
timer
.
GetElapsedTime
();
}
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
float
ave_time
=
total_time
/
nrepeat
;
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
M
+
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
+
sizeof
(
CDataType
)
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
a3b4c5cb
...
...
@@ -43,19 +43,20 @@ namespace profiler {
template
<
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
void
profile_grouped_gemm_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
int
nrepeat
,
std
::
vector
<
int
>
Ms
,
std
::
vector
<
int
>
Ns
,
std
::
vector
<
int
>
Ks
,
std
::
vector
<
int
>
StrideAs
,
std
::
vector
<
int
>
StrideBs
,
std
::
vector
<
int
>
StrideCs
)
bool
time_kernel
,
const
std
::
vector
<
int
>
&
Ms
,
const
std
::
vector
<
int
>
&
Ns
,
const
std
::
vector
<
int
>
&
Ks
,
const
std
::
vector
<
int
>
&
StrideAs
,
const
std
::
vector
<
int
>
&
StrideBs
,
const
std
::
vector
<
int
>
&
StrideCs
)
{
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
...
...
@@ -71,7 +72,7 @@ void profile_grouped_gemm_impl(int do_verification,
}
};
in
t
group_count
=
Ms
.
size
();
std
::
size_
t
group_count
=
Ms
.
size
();
if
(
!
(
group_count
==
Ns
.
size
()
&&
group_count
==
Ks
.
size
()
&&
group_count
==
StrideAs
.
size
()
&&
group_count
==
StrideBs
.
size
()
&&
group_count
==
StrideCs
.
size
()))
...
...
@@ -83,7 +84,7 @@ void profile_grouped_gemm_impl(int do_verification,
std
::
vector
<
Tensor
<
BDataType
>>
b_k_n
;
std
::
vector
<
Tensor
<
CDataType
>>
c_m_n_device_results
;
for
(
in
t
i
=
0
;
i
<
Ms
.
size
()
;
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
group_count
;
i
++
)
{
a_m_k
.
push_back
(
Tensor
<
ADataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ks
[
i
],
StrideAs
[
i
],
ALayout
{})));
...
...
@@ -144,7 +145,7 @@ void profile_grouped_gemm_impl(int do_verification,
gemm_shapes
.
reserve
(
group_count
);
for
(
in
t
i
=
0
;
i
<
group_count
;
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
group_count
;
i
++
)
{
a_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_m_k
[
i
].
mDesc
.
GetElementSpace
()));
...
...
@@ -231,10 +232,11 @@ void profile_grouped_gemm_impl(int do_verification,
{
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
0
,
num_btype
=
0
;
for
(
in
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
flop
+=
std
::
size_t
(
2
)
*
Ms
[
i
]
*
Ns
[
i
]
*
Ks
[
i
];
...
...
@@ -258,7 +260,7 @@ void profile_grouped_gemm_impl(int do_verification,
if
(
do_verification
)
{
for
(
in
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
c_device_buf
[
i
]
->
FromDevice
(
c_m_n_device_results
[
i
].
mData
.
data
());
...
...
@@ -270,6 +272,7 @@ void profile_grouped_gemm_impl(int do_verification,
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
...
...
profiler/include/profile_reduce_impl.hpp
View file @
a3b4c5cb
...
...
@@ -5,74 +5,77 @@
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_reduction.hpp"
#include "host_common_util.hpp"
#include "host_tensor_generator.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
bool
PropagateNan
,
bool
UseIndex
>
struct
ReduceDescription
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
NumReduceDim_
=
NumReduceDim
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
static
constexpr
int
PropagateNan_
=
PropagateNan
;
static
constexpr
int
UseIndex_
=
UseIndex
;
};
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
0
,
0
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
1
>>
;
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
false
,
false
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
5
,
false
,
false
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
7
,
false
,
false
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
false
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
false
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
false
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
true
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
true
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
true
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
true
>>
;
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
)
bool
Propagat
eNan
,
bool
UseIndex
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
Nan
Opt
_
!=
static_cast
<
int
>
(
Nan
Opt
)
||
description
.
IndicesOpt
_
!=
static_cast
<
int
>
(
IndicesOpt
))
description
.
Propagate
Nan_
!=
static_cast
<
int
>
(
Propagate
Nan
)
||
description
.
UseIndex
_
!=
static_cast
<
int
>
(
UseIndex
))
return
(
false
);
if
(
DescriptionType
::
NumReduceDim_
!=
reduceDims
.
size
())
...
...
@@ -116,48 +119,18 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return
invariantDims
;
};
template
<
typename
T
>
static
void
dumpBufferToFile
(
const
char
*
fileName
,
T
*
data
,
size_t
dataNumItems
)
{
std
::
ofstream
outFile
(
fileName
,
std
::
ios
::
binary
);
if
(
outFile
)
{
outFile
.
write
(
reinterpret_cast
<
char
*>
(
data
),
dataNumItems
*
sizeof
(
T
));
outFile
.
close
();
std
::
cout
<<
"Write output to file "
<<
fileName
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"Could not open file "
<<
fileName
<<
" for writing"
<<
std
::
endl
;
}
};
// map the data type used by the GPU kernels to the corresponding type used by the host codes
template
<
typename
InType
>
struct
type_mapping
{
using
OutType
=
InType
;
};
template
<
>
struct
type_mapping
<
ck
::
half_t
>
{
using
OutType
=
half_float
::
half
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
profile_reduce_impl_impl
(
bool
do_verification
,
bool
Propagat
eNan
,
bool
UseIndex
>
bool
profile_reduce_impl_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
int
nrepeat
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
float
alpha
,
...
...
@@ -165,16 +138,13 @@ void profile_reduce_impl_impl(bool do_verification,
{
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
host_reduc
e
;
using
ck
::
host_common
::
dumpBufferToFil
e
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
));
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
PROPAGATE_NAN
);
constexpr
bool
OutputIndex
=
(
op_support_indices
&&
UseIndex
);
constexpr
bool
out_support_atomic_add
=
std
::
is_same
<
OutDataType
,
float
>::
value
;
constexpr
bool
op_support_atomic_add
=
...
...
@@ -195,8 +165,7 @@ void profile_reduce_impl_impl(bool do_verification,
(
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
float
>::
value
);
// 1) The indices can only be used when the reduction operation is indexable
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
UseIndex
);
// 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
// 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
...
...
@@ -219,6 +188,8 @@ void profile_reduce_impl_impl(bool do_verification,
constexpr
bool
invalid_reduce
=
(
invalid_reduce_1
||
invalid_reduce_2
||
invalid_reduce_3
||
invalid_reduce_4
||
invalid_reduce_5
||
invalid_reduce_6
);
bool
pass
=
true
;
if
constexpr
(
!
invalid_reduce
)
{
Tensor
<
InDataType
>
in
(
inLengths
);
...
...
@@ -282,42 +253,26 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
size_t
indicesSizeInBytes
=
NeedIndices
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
size_t
indicesSizeInBytes
=
OutputIndex
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
DeviceMem
out_indices_dev
(
indicesSizeInBytes
);
float
best_avg_time
=
0
;
float
best_gb_per_sec
=
0
;
using
InElementwiseOperation
_0
=
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
_0
=
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
InElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
;
using
AccElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
;
using
InElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
DeviceReduceInstPtr0
=
DeviceReducePtr
<
InElementwiseOperation_0
,
AccElementwiseOperation_0
>
;
using
DeviceReduceInstPtr1
=
DeviceReducePtr
<
InElementwiseOperation_1
,
AccElementwiseOperation_1
>
;
using
DeviceReduceInstPtr2
=
DeviceReducePtr
<
InElementwiseOperation_2
,
AccElementwiseOperation_2
>
;
DeviceReducePtr
<
InElementwiseOperation
,
AccElementwiseOperation
>
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
std
::
vector
<
DeviceReduceInstPtr1
>
reduce1_ptrs
;
std
::
vector
<
DeviceReduceInstPtr2
>
reduce2_ptrs
;
add_device_reduce_instance_threadwise
<
InDataType
,
AccDataType
,
...
...
@@ -325,8 +280,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
add_device_reduce_instance_blockwise
<
InDataType
,
AccDataType
,
...
...
@@ -334,8 +289,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
if
constexpr
(
use_atomic_add
)
{
...
...
@@ -345,35 +300,11 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
}
else
{
add_device_reduce_instance_multiblock_partial_reduce
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce1_ptrs
);
};
// used for secondary reduction
if
constexpr
(
!
use_atomic_add
)
{
add_device_reduce_instance_blockwise_second_call
<
AccDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce2_ptrs
);
};
if
(
reduce0_ptrs
.
empty
()
&&
reduce1_ptrs
.
empty
()
)
if
(
reduce0_ptrs
.
empty
())
{
throw
std
::
runtime_error
(
"Wrong! No device REDUCE instance found"
);
};
...
...
@@ -383,31 +314,34 @@ void profile_reduce_impl_impl(bool do_verification,
ReductionHost
<
InDataType
,
AccDataType
,
OutDataType
,
ReduceOpId
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
Rank
,
NumReduceDim
,
PropagateNan
,
NeedIndices
>
OutputIndex
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
};
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_outLengths
=
to_int_vector
(
outLengths
);
const
auto
i_outStrides
=
to_int_vector
(
outStrides
);
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_0
in_elementwise_op_0
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_0
acc_elementwise_op_0
(
static_cast
<
int32_t
>
(
reduce_total_length
));
InElementwiseOperation
in_elementwise_op
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation
acc_elementwise_op
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
...
...
@@ -417,11 +351,11 @@ void profile_reduce_impl_impl(bool do_verification,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_0
,
acc_elementwise_op_0
);
in_elementwise_op
,
acc_elementwise_op
);
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
...
...
@@ -430,7 +364,8 @@ void profile_reduce_impl_impl(bool do_verification,
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
...
...
@@ -438,8 +373,9 @@ void profile_reduce_impl_impl(bool do_verification,
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
...
...
@@ -449,22 +385,24 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
{
bool
single_pass
;
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
single_pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
if
(
OutputIndex
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
)
;
};
if
(
do_log
)
if
(
!
single_pass
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
;
std
::
cout
<<
"Fail Info: "
<<
reduce_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
pass
=
pass
&&
single_pass
;
};
if
(
do_dumpout
)
...
...
@@ -473,7 +411,7 @@ void profile_reduce_impl_impl(bool do_verification,
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
if
(
OutputIndex
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
...
...
@@ -485,156 +423,34 @@ void profile_reduce_impl_impl(bool do_verification,
};
};
for
(
auto
&
reduce_ptr
:
reduce1_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_1
in_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_1
acc_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_1
,
acc_elementwise_op_1
);
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
std
::
string
reduce_name
=
reduce_ptr
->
GetTypeString
();
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
std
::
vector
<
int
>
inLengths2
=
reduce_ptr
->
GetWorkspace2dLengths
(
argument_ptr
.
get
());
std
::
vector
<
int
>
inStrides2
{
inLengths2
[
1
],
1
};
for
(
auto
&
reduce2_ptr
:
reduce2_ptrs
)
{
InElementwiseOperation_2
in_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_2
acc_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument2_ptr
=
reduce2_ptr
->
MakeArgumentPointer
(
inLengths2
,
inStrides2
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
ws_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_2
,
acc_elementwise_op_2
);
if
(
!
reduce2_ptr
->
IsSupportedArgument
(
argument2_ptr
.
get
()))
continue
;
std
::
string
reduce2_name
=
reduce2_ptr
->
GetTypeString
();
auto
invoker2_ptr
=
reduce2_ptr
->
MakeInvokerPointer
();
float
avg_time_2
=
invoker2_ptr
->
Run
(
argument2_ptr
.
get
(),
nrepeat
);
std
::
size_t
num_bytes_2
=
static_cast
<
size_t
>
(
inLengths2
[
0
])
*
inLengths2
[
1
]
*
sizeof
(
AccDataType
);
float
gb_per_sec
=
(
num_bytes
+
num_bytes_2
)
/
1.E6
/
(
avg_time
+
avg_time_2
);
std
::
cout
<<
"Perf: "
<<
(
avg_time
+
avg_time_2
)
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
" => "
<<
reduce2_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
best_avg_time
=
avg_time
+
avg_time_2
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
};
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
}
if
(
do_dumpout
)
{
dumpBufferToFile
(
"dump_in.bin"
,
in
.
mData
.
data
(),
in
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
out_indices
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_indices_host.bin"
,
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
mDesc
.
GetElementSize
());
};
};
};
};
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"The requested reduction operation is not supported, please check !!!"
<<
std
::
endl
;
};
return
pass
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
>
void
profile_reduce_impl
(
bool
do_verification
,
bool
profile_reduce_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
int
nrepeat
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
,
bool
Propagat
eNan
,
bool
UseIndex
,
float
alpha
,
float
beta
)
{
bool
matched
=
false
;
bool
pass
=
true
;
using
tuple_of_description_instances
=
tensor_operation
::
device
::
device_reduce_instance
::
reduce_description_instances
;
...
...
@@ -648,29 +464,30 @@ void profile_reduce_impl(bool do_verification,
using
descType
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
>
(
tuple_object
))
>
;
if
(
!
description_match
(
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
NanOpt
,
IndicesOpt
))
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
PropagateNan
,
UseIndex
))
return
;
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices
>
(
descType
::
IndicesOpt_
)
>
(
do_verification
,
init_method
,
do_log
,
do_dumpout
,
nrepeat
,
inLengths
,
reduceDims
,
alpha
,
beta
);
pass
=
pass
&&
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
bool
>
(
descType
::
PropagateNan_
),
static_cast
<
bool
>
(
descType
::
UseIndex_
)
>
(
do_verification
,
init_method
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
matched
=
true
;
});
return
pass
;
};
}
// namespace profiler
...
...
profiler/src/profile_batched_gemm.cpp
View file @
a3b4c5cb
...
...
@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
printf
(
" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount
\n
"
);
exit
(
1
);
}
...
...
@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_batched_gemm_reduce.cpp
View file @
a3b4c5cb
...
...
@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount
\n
"
);
printf
(
"arg15: split k into mulitiple batch
\n
"
);
exit
(
1
);
...
...
@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_bwd_data.cpp
deleted
100644 → 0
View file @
48918ab9
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_bwd_data_impl.hpp"
// Element-type combination requested on the command line.
// profile_conv_bwd_data() produces a value of this type by casting the integer
// parsed from argv[2], so the numeric values below are part of the CLI contract.
// NOTE(review): the three name components presumably stand for the
// input / weight / output element types -- confirm against the profiler impl.
enum class ConvDataType
{
    F32_F32_F32    = 0,
    F16_F16_F16    = 1,
    BF16_BF16_BF16 = 2,
    INT8_INT8_INT8 = 3,
};
// Memory layout of the input tensor as requested on the command line.
// profile_conv_bwd_data() casts the integer parsed from argv[3] to this type,
// so the numeric values are part of the CLI contract.
enum class ConvInputLayout
{
    NCHW = 0,
    NHWC = 1,
};
// Memory layout of the weight (filter) tensor as requested on the command line.
// profile_conv_bwd_data() casts the integer parsed from argv[4] to this type,
// so the numeric values are part of the CLI contract.
enum class ConvWeightLayout
{
    KCYX = 0,
    KYXC = 1,
};
// Memory layout of the output tensor as requested on the command line.
// profile_conv_bwd_data() casts the integer parsed from argv[5] to this type,
// so the numeric values are part of the CLI contract.
enum class ConvOutputLayout
{
    NKHW = 0,
    NHWK = 1,
};
int
profile_conv_bwd_data
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
25
)
{
printf
(
"arg1: tensor operation (conv_bwd: BackwardConvolution)
\n
"
);
printf
(
"arg2: data type (0: fp32; 1: fp16)
\n
"
);
printf
(
"arg3: input tensor layout (0: NCHW; 1: NHWC)
\n
"
);
printf
(
"arg4: weight tensor layout (0: KCYX; 1: KYXC)
\n
"
);
printf
(
"arg5: output tensor layout (0: NKHW; 1: NHWK)
\n
"
);
printf
(
"arg6: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg7: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg9: run kernel # of times (>1)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
ConvDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
in_layout
=
static_cast
<
ConvInputLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
auto
wei_layout
=
static_cast
<
ConvWeightLayout
>
(
std
::
stoi
(
argv
[
4
]));
const
auto
out_layout
=
static_cast
<
ConvOutputLayout
>
(
std
::
stoi
(
argv
[
5
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
ck
::
index_t
N
=
std
::
stoi
(
argv
[
10
]);
const
ck
::
index_t
K
=
std
::
stoi
(
argv
[
11
]);
const
ck
::
index_t
C
=
std
::
stoi
(
argv
[
12
]);
const
ck
::
index_t
Y
=
std
::
stoi
(
argv
[
13
]);
const
ck
::
index_t
X
=
std
::
stoi
(
argv
[
14
]);
const
ck
::
index_t
Hi
=
std
::
stoi
(
argv
[
15
]);
const
ck
::
index_t
Wi
=
std
::
stoi
(
argv
[
16
]);
const
ck
::
index_t
conv_stride_h
=
std
::
stoi
(
argv
[
17
]);
const
ck
::
index_t
conv_stride_w
=
std
::
stoi
(
argv
[
18
]);
const
ck
::
index_t
conv_dilation_h
=
std
::
stoi
(
argv
[
19
]);
const
ck
::
index_t
conv_dilation_w
=
std
::
stoi
(
argv
[
20
]);
const
ck
::
index_t
in_left_pad_h
=
std
::
stoi
(
argv
[
21
]);
const
ck
::
index_t
in_left_pad_w
=
std
::
stoi
(
argv
[
22
]);
const
ck
::
index_t
in_right_pad_h
=
std
::
stoi
(
argv
[
23
]);
const
ck
::
index_t
in_right_pad_w
=
std
::
stoi
(
argv
[
24
]);
const
ck
::
index_t
YEff
=
(
Y
-
1
)
*
conv_dilation_h
+
1
;
const
ck
::
index_t
XEff
=
(
X
-
1
)
*
conv_dilation_w
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
YEff
)
/
conv_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
if
(
data_type
==
ConvDataType
::
F32_F32_F32
&&
in_layout
==
ConvInputLayout
::
NHWC
&&
wei_layout
==
ConvWeightLayout
::
KYXC
&&
out_layout
==
ConvOutputLayout
::
NHWK
)
{
ck
::
profiler
::
profile_conv_bwd_data_impl
<
2
,
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
convolution
::
NHWC
,
ck
::
tensor_layout
::
convolution
::
KYXC
,
ck
::
tensor_layout
::
convolution
::
NHWK
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
N
,
K
,
C
,
std
::
vector
<
ck
::
index_t
>
{
Hi
,
Wi
},
std
::
vector
<
ck
::
index_t
>
{
Y
,
X
},
std
::
vector
<
ck
::
index_t
>
{
Ho
,
Wo
},
std
::
vector
<
ck
::
index_t
>
{
conv_stride_h
,
conv_stride_w
},
std
::
vector
<
ck
::
index_t
>
{
conv_dilation_h
,
conv_dilation_w
},
std
::
vector
<
ck
::
index_t
>
{
in_left_pad_h
,
in_left_pad_w
},
std
::
vector
<
ck
::
index_t
>
{
in_right_pad_h
,
in_right_pad_w
});
}
else
if
(
data_type
==
ConvDataType
::
F16_F16_F16
&&
in_layout
==
ConvInputLayout
::
NHWC
&&
wei_layout
==
ConvWeightLayout
::
KYXC
&&
out_layout
==
ConvOutputLayout
::
NHWK
)
{
ck
::
profiler
::
profile_conv_bwd_data_impl
<
2
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
convolution
::
NHWC
,
ck
::
tensor_layout
::
convolution
::
KYXC
,
ck
::
tensor_layout
::
convolution
::
NHWK
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
N
,
K
,
C
,
std
::
vector
<
ck
::
index_t
>
{
Hi
,
Wi
},
std
::
vector
<
ck
::
index_t
>
{
Y
,
X
},
std
::
vector
<
ck
::
index_t
>
{
Ho
,
Wo
},
std
::
vector
<
ck
::
index_t
>
{
conv_stride_h
,
conv_stride_w
},
std
::
vector
<
ck
::
index_t
>
{
conv_dilation_h
,
conv_dilation_w
},
std
::
vector
<
ck
::
index_t
>
{
in_left_pad_h
,
in_left_pad_w
},
std
::
vector
<
ck
::
index_t
>
{
in_right_pad_h
,
in_right_pad_w
});
}
else
if
(
data_type
==
ConvDataType
::
BF16_BF16_BF16
&&
in_layout
==
ConvInputLayout
::
NHWC
&&
wei_layout
==
ConvWeightLayout
::
KYXC
&&
out_layout
==
ConvOutputLayout
::
NHWK
)
{
ck
::
profiler
::
profile_conv_bwd_data_impl
<
2
,
uint16_t
,
uint16_t
,
uint16_t
,
float
,
ck
::
tensor_layout
::
convolution
::
NHWC
,
ck
::
tensor_layout
::
convolution
::
KYXC
,
ck
::
tensor_layout
::
convolution
::
NHWK
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
N
,
K
,
C
,
std
::
vector
<
ck
::
index_t
>
{
Hi
,
Wi
},
std
::
vector
<
ck
::
index_t
>
{
Y
,
X
},
std
::
vector
<
ck
::
index_t
>
{
Ho
,
Wo
},
std
::
vector
<
ck
::
index_t
>
{
conv_stride_h
,
conv_stride_w
},
std
::
vector
<
ck
::
index_t
>
{
conv_dilation_h
,
conv_dilation_w
},
std
::
vector
<
ck
::
index_t
>
{
in_left_pad_h
,
in_left_pad_w
},
std
::
vector
<
ck
::
index_t
>
{
in_right_pad_h
,
in_right_pad_w
});
}
else
if
(
data_type
==
ConvDataType
::
INT8_INT8_INT8
&&
in_layout
==
ConvInputLayout
::
NHWC
&&
wei_layout
==
ConvWeightLayout
::
KYXC
&&
out_layout
==
ConvOutputLayout
::
NHWK
)
{
ck
::
profiler
::
profile_conv_bwd_data_impl
<
2
,
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
convolution
::
NHWC
,
ck
::
tensor_layout
::
convolution
::
KYXC
,
ck
::
tensor_layout
::
convolution
::
NHWK
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
N
,
K
,
C
,
std
::
vector
<
ck
::
index_t
>
{
Hi
,
Wi
},
std
::
vector
<
ck
::
index_t
>
{
Y
,
X
},
std
::
vector
<
ck
::
index_t
>
{
Ho
,
Wo
},
std
::
vector
<
ck
::
index_t
>
{
conv_stride_h
,
conv_stride_w
},
std
::
vector
<
ck
::
index_t
>
{
conv_dilation_h
,
conv_dilation_w
},
std
::
vector
<
ck
::
index_t
>
{
in_left_pad_h
,
in_left_pad_w
},
std
::
vector
<
ck
::
index_t
>
{
in_right_pad_h
,
in_right_pad_w
});
}
else
{
throw
std
::
runtime_error
(
"wrong! this Conv data_type & layout is not implemented"
);
}
return
1
;
}
profiler/src/profile_conv_bwd_weight.cpp
View file @
a3b4c5cb
...
...
@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
9
]);
const
ck
::
index_t
N
=
std
::
stoi
(
argv
[
10
]);
const
ck
::
index_t
K
=
std
::
stoi
(
argv
[
11
]);
...
...
@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
N
,
K
,
C
,
...
...
@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
N
,
K
,
C
,
...
...
@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this Conv data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu.cpp
View file @
a3b4c5cb
...
...
@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
printf
(
"arg6: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg7: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg9:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg9:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
...
...
@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
9
]);
const
ck
::
index_t
N
=
std
::
stoi
(
argv
[
10
]);
const
ck
::
index_t
K
=
std
::
stoi
(
argv
[
11
]);
...
...
@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
N
,
K
,
C
,
...
...
@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_add.cpp
View file @
a3b4c5cb
...
...
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
printf
(
"arg6: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg7: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg9:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg9:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
...
...
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
9
]);
const
ck
::
index_t
N
=
std
::
stoi
(
argv
[
10
]);
const
ck
::
index_t
K
=
std
::
stoi
(
argv
[
11
]);
...
...
@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
N
,
K
,
C
,
...
...
@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
View file @
a3b4c5cb
...
...
@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
printf
(
"arg6: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg7: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg9:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg9:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
1
);
...
...
@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
9
]);
const
ck
::
index_t
N
=
std
::
stoi
(
argv
[
10
]);
const
ck
::
index_t
K
=
std
::
stoi
(
argv
[
11
]);
...
...
@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
N
,
K
,
C
,
...
...
@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_convnd_bwd_data.cpp
View file @
a3b4c5cb
...
...
@@ -39,40 +39,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[],
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
num_dim_spatial
;
params
.
N
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
num_dim_spatial
_
=
num_dim_spatial
;
params
.
N
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
filter_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
filter_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
input_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_strides
.
resize
(
num_dim_spatial
);
params
.
conv_filter_strides
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_strides
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_strides
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_dilations
.
resize
(
num_dim_spatial
);
params
.
conv_filter_dilations
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_dilations
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_dilations
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_left_pads
.
resize
(
num_dim_spatial
);
params
.
input_left_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_left_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_left_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_right_pads
.
resize
(
num_dim_spatial
);
params
.
input_right_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_right_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_right_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
return
params
;
...
...
@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
printf
(
"arg6: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg7: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg8: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg9:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg9:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
return
1
;
...
...
@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
const
bool
do_verification
=
std
::
stoi
(
argv
[
6
]);
const
int
init_method
=
std
::
stoi
(
argv
[
7
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
8
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
9
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
9
]);
ck
::
utils
::
conv
::
ConvParams
params
=
parse_conv_params
(
num_dim_spatial
,
argv
,
preParams
);
...
...
@@ -132,17 +132,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification
,
init_method
,
do_log
,
nrepeat
,
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
time_kernel
,
params
.
N
_
,
params
.
K
_
,
params
.
C
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
params
.
GetOutputSpatialLengths
(),
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
);
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
);
break
;
case
2
:
...
...
@@ -157,17 +157,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification
,
init_method
,
do_log
,
nrepeat
,
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
time_kernel
,
params
.
N
_
,
params
.
K
_
,
params
.
C
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
params
.
GetOutputSpatialLengths
(),
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
);
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
);
break
;
case
3
:
...
...
@@ -182,17 +182,17 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification
,
init_method
,
do_log
,
nrepeat
,
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
time_kernel
,
params
.
N
_
,
params
.
K
_
,
params
.
C
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
params
.
GetOutputSpatialLengths
(),
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
);
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
);
break
;
default:
break
;
...
...
profiler/src/profile_convnd_fwd.cpp
View file @
a3b4c5cb
...
...
@@ -5,7 +5,7 @@
#include <vector>
#include <half.hpp>
#include "conv_
fwd_
util.hpp"
#include "conv_util.hpp"
#include "element_wise_operation.hpp"
#include "fill.hpp"
#include "profile_convnd_fwd.hpp"
...
...
@@ -119,7 +119,7 @@ template <int NDim,
void
profile_convnd_instances_impl
(
const
ck
::
utils
::
conv
::
ConvParams
&
params
,
bool
do_verification
,
bool
do_log
,
int
nrepeat
,
bool
time_kernel
,
int
init_method
,
ConvLayouts
)
{
...
...
@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
reference_conv_fwd_fun
);
auto
best_conf
=
run_engine
.
Profile
(
conv
::
ConvolutionFwdInstances
<
InDataType
,
WeiDataType
,
OutDataType
>::
template
Get
<
NDim
>(),
nrepeat
,
time_kernel
,
do_verification
,
do_log
);
...
...
@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
const
ck
::
utils
::
conv
::
ConvParams
&
params
,
bool
do_verification
,
bool
do_log
,
int
nrepeat
,
bool
time_kernel
,
int
init_method
)
{
switch
(
data_layout
)
...
...
@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NHWC
>
{});
break
;
...
...
@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NHWC
>
{});
break
;
...
...
@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NHWC
>
{});
break
;
...
...
@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NHWC
>
{});
break
;
...
...
@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NCHW
>
{});
break
;
...
...
@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NCHW
>
{});
break
;
...
...
@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NCHW
>
{});
break
;
...
...
@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
params
,
do_verification
,
do_log
,
nrepeat
,
time_kernel
,
init_method
,
ConvolutionLayouts
<
NDim
,
ConvDataLayout
::
NCHW
>
{});
break
;
...
...
@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
bool
do_verification
{
true
};
int
init_method
{
2
};
bool
do_log
{
false
};
int
nrepeat
{
100
};
bool
time_kernel
{
false
};
int
num_dim_spatial
{
2
};
ConvParams
params
;
...
...
@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
do_verification
=
std
::
stoi
(
argv
[
4
]);
init_method
=
std
::
stoi
(
argv
[
5
]);
do_log
=
std
::
stoi
(
argv
[
6
]);
nrepeat
=
std
::
stoi
(
argv
[
7
]);
time_kernel
=
std
::
stoi
(
argv
[
7
]);
num_dim_spatial
=
std
::
stoi
(
argv
[
8
]);
}
if
(
argc
>=
10
)
...
...
@@ -332,20 +332,20 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
{
case
1
:
profile_convnd_instances
<
1
>
(
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
nrepeat
,
init_method
);
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
time_kernel
,
init_method
);
break
;
case
2
:
profile_convnd_instances
<
2
>
(
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
nrepeat
,
init_method
);
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
time_kernel
,
init_method
);
break
;
case
3
:
profile_convnd_instances
<
3
>
(
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
nrepeat
,
init_method
);
data_type
,
data_layout
,
params
,
do_verification
,
do_log
,
time_kernel
,
init_method
);
break
;
default:
throw
std
::
runtime_error
(
"profile_conv_fwd: unsupported num_dim_spatial value: "
+
std
::
to_string
(
num_dim_spatial
));
}
return
1
;
return
0
;
}
profiler/src/profile_gemm.cpp
View file @
a3b4c5cb
...
...
@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: split k into mulitiple batch
\n
"
);
exit
(
1
);
...
...
@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -68,13 +68,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -88,13 +89,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -108,13 +110,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -128,13 +131,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -154,7 +159,7 @@ int profile_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -166,6 +171,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -174,7 +180,7 @@ int profile_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -186,6 +192,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -194,7 +201,7 @@ int profile_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -206,6 +213,7 @@ int profile_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_gemm_impl
<
float
,
float
,
float
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -214,7 +222,7 @@ int profile_gemm(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -228,13 +236,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -248,13 +257,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -268,13 +278,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -288,13 +299,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
int8_t
,
int8_t
,
int8_t
,
int32_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -308,13 +320,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -328,13 +341,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -348,13 +362,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -368,13 +383,14 @@ int profile_gemm(int argc, char* argv[])
ck
::
profiler
::
profile_gemm_impl
<
ck
::
bhalf_t
,
ck
::
bhalf_t
,
ck
::
bhalf_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -388,5 +404,5 @@ int profile_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_2d.cpp
View file @
a3b4c5cb
...
...
@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: alpha
\n
"
);
printf
(
"arg15: beta
\n
"
);
...
...
@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu.cpp
View file @
a3b4c5cb
...
...
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: split k into mulitiple batch
\n
"
);
exit
(
1
);
...
...
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu_add.cpp
View file @
a3b4c5cb
...
...
@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1
\n
"
);
printf
(
"arg15: split k into mulitiple batch
\n
"
);
exit
(
1
);
...
...
@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_reduce.cpp
View file @
a3b4c5cb
...
...
@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
);
printf
(
"arg14: split k into mulitiple batch
\n
"
);
exit
(
1
);
...
...
@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
...
...
@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
M
,
N
,
K
,
...
...
@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_grouped_gemm.cpp
View file @
a3b4c5cb
...
...
@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
printf
(
" 3: A[k, m] * B[n, k] = C[m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg
8
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg
6
: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)
\n
"
);
exit
(
1
);
...
...
@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
7
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
auto
Ms
=
argToIntArray
(
argv
[
8
]);
const
auto
Ns
=
argToIntArray
(
argv
[
9
]);
...
...
@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
Ms
,
Ns
,
Ks
,
...
...
@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
...
...
@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
Ms
,
Ns
,
Ks
,
...
...
@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
Ms
,
Ns
,
Ks
,
...
...
@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
ck
::
profiler
::
profile_grouped_gemm_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
...
...
@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck
::
tensor_layout
::
gemm
::
RowMajor
>
(
do_verification
,
init_method
,
do_log
,
nrepeat
,
time_kernel
,
Ms
,
Ns
,
Ks
,
...
...
@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_reduce.cpp
View file @
a3b4c5cb
#include <iostream>
#include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <getopt.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "data_type_enum.hpp"
#include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp"
using
namespace
std
;
using
ck
::
NanPropagation
;
using
ck
::
ReduceTensorIndices
;
using
ck
::
ReduceTensorOp
;
static
struct
option
long_options
[]
=
{{
"inLengths"
,
required_argument
,
nullptr
,
'D'
},
...
...
@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{
"bf16"
,
no_argument
,
nullptr
,
'?'
},
{
"dumpout"
,
required_argument
,
nullptr
,
'o'
},
{
"verify"
,
required_argument
,
nullptr
,
'v'
},
{
"log"
,
required_argument
,
nullptr
,
'l'
},
{
"help"
,
no_argument
,
nullptr
,
'?'
},
{
nullptr
,
0
,
nullptr
,
0
}};
/// Parses a single value of type T from a piece of text using stream extraction.
///
/// @tparam T        Type to extract; must be value-initializable and support
///                  `operator>>` on a `std::istream`.
/// @param valueStr  Text holding one value, e.g. "42" or "0.5".
/// @return The extracted value. For arithmetic T, text that yields no value
///         (empty, whitespace-only or non-numeric) produces 0.
template <typename T>
static T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    // Value-initialize the result: when the text is empty the extraction
    // fails before it ever writes to `val`, and returning an uninitialized
    // object would be undefined behavior.
    T val{};

    iss >> val;

    return val;
}
/// Splits a comma-separated C string and parses every field as a value of type T.
///
/// The text is cut at each ',' and each piece is parsed with stream
/// extraction. The result always has (number of commas + 1) elements for a
/// non-null input, so "1,2,3" yields three values and "" yields one.
///
/// @tparam T            Element type; must be value-initializable and support
///                      `operator>>` on a `std::istream`.
/// @param cstr_values   NUL-terminated text such as "64,4,280,82"; may be null.
/// @return The parsed values in input order. A null pointer yields an empty
///         vector. For arithmetic T, a field that yields no value (for
///         example the empty field after a trailing comma) becomes 0.
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    std::vector<T> values;

    // Building a std::string from a null pointer is undefined behavior, so
    // treat "no text at all" as "no values".
    if(cstr_values == nullptr)
        return values;

    const std::string valuesStr(cstr_values);

    // Parses one field. The result is value-initialized because extraction
    // from an empty field never writes to it.
    const auto parseField = [](const std::string& field) {
        std::istringstream iss(field);
        T parsed{};
        iss >> parsed;
        return parsed;
    };

    std::size_t pos = 0;

    // Every ',' closes one field; whatever follows the last ',' is the final field.
    for(std::size_t comma = valuesStr.find(',', pos); comma != std::string::npos;
        comma             = valuesStr.find(',', pos))
    {
        values.push_back(parseField(valuesStr.substr(pos, comma - pos)));
        pos = comma + 1;
    }

    values.push_back(parseField(valuesStr.substr(pos)));

    return values;
}
// Numeric identifiers for the element data types selectable in this file.
// The explicit values are part of the contract: code in this file converts
// integers parsed from the command line into this enum with static_cast.
enum class AppDataType
{
    appHalf     = 0,
    appFloat    = 1,
    appInt32    = 2,
    appInt8     = 3,
    appInt8x4   = 4,
    appBFloat16 = 5,
    appDouble   = 6
};
static
void
check_reduce_dims
(
const
int
rank
,
const
std
::
vector
<
int
>&
reduceDims
)
{
for
(
auto
dim
:
reduceDims
)
...
...
@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
};
};
class
App
Args
class
ReduceProfiler
Args
{
private:
int
option_index
=
0
;
...
...
@@ -130,26 +68,23 @@ class AppArgs
std
::
vector
<
float
>
scales
;
ReduceTensorOp
reduceOp
=
ReduceTensorOp
::
ADD
;
App
DataType
compTypeId
=
App
DataType
::
app
Float
;
App
DataType
outTypeId
=
App
DataType
::
app
Float
;
ReduceTensorOp
reduceOp
=
ReduceTensorOp
::
ADD
;
ck
::
DataType
Enum
compTypeId
=
ck
::
DataType
Enum
::
Float
;
ck
::
DataType
Enum
outTypeId
=
ck
::
DataType
Enum
::
Float
;
bool
compType_assigned
=
false
;
bool
outType_assigned
=
false
;
NanPropagation
nanOpt
=
NanPropagation
::
NOT_PROPAGATE_NAN
;
ReduceTensorIndices
indicesOpt
=
ReduceTensorIndices
::
NO_INDICES
;
bool
do_log
=
false
;
bool
do_verification
=
false
;
bool
do_dumpout
=
false
;
int
nanOpt
=
0
;
int
indicesOpt
=
0
;
bool
do_verification
=
false
;
bool
do_dumpout
=
false
;
int
init_method
;
int
nrepeat
;
bool
time_kernel
;
bool
need_indices
=
false
;
AppArgs
()
=
default
;
~
AppArgs
()
=
default
;
ReduceProfilerArgs
()
=
default
;
~
ReduceProfilerArgs
()
=
default
;
void
show_usage
(
const
char
*
cmd
)
{
...
...
@@ -166,8 +101,11 @@ class AppArgs
std
::
cout
<<
"--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half"
<<
std
::
endl
;
std
::
cout
<<
"--nanOpt or -N, enum value indicates the selection for NanOpt"
<<
std
::
endl
;
std
::
cout
<<
"--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
std
::
cout
<<
"--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<<
std
::
endl
;
std
::
cout
<<
"--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<<
std
::
endl
;
std
::
cout
<<
"--scales or -S, comma separated two float values for alpha and beta"
<<
std
::
endl
;
...
...
@@ -181,18 +119,19 @@ class AppArgs
std
::
cout
<<
"--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis"
<<
std
::
endl
;
std
::
cout
<<
"--log or -l, 1/0 to indicate whether to log some information"
<<
std
::
endl
;
};
int
processArgs
(
int
argc
,
char
*
argv
[])
{
unsigned
int
ch
;
using
ck
::
host_common
::
getTypeValuesFromString
;
int
ch
;
optind
++
;
// to skip the "reduce" module name
while
(
1
)
{
ch
=
getopt_long
(
argc
,
argv
,
"D:R:O:C:W:N:I:S:v:o:
l:
"
,
long_options
,
&
option_index
);
ch
=
getopt_long
(
argc
,
argv
,
"D:R:O:C:W:N:I:S:v:o:"
,
long_options
,
&
option_index
);
if
(
ch
==
-
1
)
break
;
switch
(
ch
)
...
...
@@ -219,27 +158,27 @@ class AppArgs
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
compTypeId
=
static_cast
<
App
DataType
>
(
std
::
atoi
(
optarg
));
compTypeId
=
static_cast
<
ck
::
DataType
Enum
>
(
std
::
atoi
(
optarg
));
compType_assigned
=
true
;
break
;
case
'W'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
outTypeId
=
static_cast
<
App
DataType
>
(
std
::
atoi
(
optarg
));
outTypeId
=
static_cast
<
ck
::
DataType
Enum
>
(
std
::
atoi
(
optarg
));
outType_assigned
=
true
;
break
;
case
'N'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
nanOpt
=
static_cast
<
NanPropagation
>
(
std
::
atoi
(
optarg
)
)
;
nanOpt
=
std
::
atoi
(
optarg
);
break
;
case
'I'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
indicesOpt
=
static_cast
<
ReduceTensorIndices
>
(
std
::
atoi
(
optarg
)
)
;
indicesOpt
=
std
::
atoi
(
optarg
);
break
;
case
'S'
:
if
(
!
optarg
)
...
...
@@ -262,12 +201,6 @@ class AppArgs
do_dumpout
=
static_cast
<
bool
>
(
std
::
atoi
(
optarg
));
break
;
case
'l'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
do_log
=
static_cast
<
bool
>
(
std
::
atoi
(
optarg
));
break
;
case
'?'
:
if
(
std
::
string
(
long_options
[
option_index
].
name
)
==
"half"
)
use_half
=
true
;
...
...
@@ -295,7 +228,7 @@ class AppArgs
throw
std
::
runtime_error
(
"Invalid cmd-line arguments, more argumetns are needed!"
);
init_method
=
std
::
atoi
(
argv
[
optind
++
]);
nrepeat
=
std
::
atoi
(
argv
[
optind
]);
time_kernel
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
])
)
;
if
(
scales
.
empty
())
{
...
...
@@ -306,9 +239,6 @@ class AppArgs
if
(
reduceOp
==
ReduceTensorOp
::
MIN
||
reduceOp
==
ReduceTensorOp
::
MAX
||
reduceOp
==
ReduceTensorOp
::
AMAX
)
{
if
(
indicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
)
need_indices
=
true
;
// for indexable operations, no need to assign compType and outType, just let them be
// same as inType
compType_assigned
=
false
;
...
...
@@ -322,9 +252,10 @@ class AppArgs
int
profile_reduce
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
::
profiler
;
using
ck
::
DataTypeEnum
;
using
ck
::
profiler
::
profile_reduce_impl
;
App
Args
args
;
ReduceProfiler
Args
args
;
if
(
args
.
processArgs
(
argc
,
argv
)
<
0
)
return
(
-
1
);
...
...
@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if
(
args
.
use_half
)
{
if
(
!
args
.
compType_assigned
)
args
.
compTypeId
=
App
DataType
::
app
Half
;
args
.
compTypeId
=
DataType
Enum
::
Half
;
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
Half
&&
args
.
outTypeId
!=
App
DataType
::
app
Float
))
args
.
outTypeId
=
App
DataType
::
app
Float
;
(
args
.
outTypeId
!=
DataType
Enum
::
Half
&&
args
.
outTypeId
!=
DataType
Enum
::
Float
))
args
.
outTypeId
=
DataType
Enum
::
Float
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
Half
;
args
.
outTypeId
=
DataType
Enum
::
Half
;
if
(
args
.
compTypeId
==
App
DataType
::
app
Half
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Half
)
{
profile_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
profile_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Float
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Float
)
{
profile_reduce_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{
profile_reduce_impl
<
double
,
double
,
double
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
use_int8
)
{
if
(
!
args
.
compType_assigned
)
args
.
compTypeId
=
App
DataType
::
app
Int8
;
args
.
compTypeId
=
DataType
Enum
::
Int8
;
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
Int8
&&
args
.
outTypeId
!=
App
DataType
::
app
Int32
))
args
.
outTypeId
=
App
DataType
::
app
Int32
;
(
args
.
outTypeId
!=
DataType
Enum
::
Int8
&&
args
.
outTypeId
!=
DataType
Enum
::
Int32
))
args
.
outTypeId
=
DataType
Enum
::
Int32
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
Int8
;
args
.
outTypeId
=
DataType
Enum
::
Int8
;
if
(
args
.
compTypeId
==
App
DataType
::
app
Int8
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Int8
)
{
profile_reduce_impl
<
int8_t
,
int8_t
,
int8_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Int32
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Int32
)
{
profile_reduce_impl
<
int8_t
,
int32_t
,
int8_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else
if
(
args
.
use_bf16
)
{
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
BFloat16
&&
args
.
outTypeId
!=
App
DataType
::
app
Float
))
args
.
outTypeId
=
App
DataType
::
app
Float
;
(
args
.
outTypeId
!=
DataType
Enum
::
BFloat16
&&
args
.
outTypeId
!=
DataType
Enum
::
Float
))
args
.
outTypeId
=
DataType
Enum
::
Float
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
BFloat16
;
args
.
outTypeId
=
DataType
Enum
::
BFloat16
;
profile_reduce_impl
<
ck
::
bhalf_t
,
float
,
ck
::
bhalf_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
{
if
(
args
.
compTypeId
==
App
DataType
::
app
Float
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Float
)
{
profile_reduce_impl
<
float
,
float
,
float
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Double
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Double
)
{
profile_reduce_impl
<
float
,
double
,
float
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
nrepeat
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
profiler/src/profiler.cpp
View file @
a3b4c5cb
...
...
@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
int
profile_batched_gemm
(
int
,
char
*
[]);
int
profile_grouped_gemm
(
int
,
char
*
[]);
int
profile_conv_fwd
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_add
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_atomic_add
(
int
,
char
*
[]);
...
...
@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
}
else
if
(
strcmp
(
argv
[
1
],
"grouped_gemm"
)
==
0
)
{
profile_grouped_gemm
(
argc
,
argv
);
return
profile_grouped_gemm
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"conv_fwd"
)
==
0
)
{
...
...
@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim
\n
"
" conv2d_bwd_data: BackwardConvolution data 2 dim
\n
"
" conv3d_bwd_data: BackwardConvolution data 3 dim
\n
"
" reduce: R
EDUCE
\n
"
" reduce: R
educe
\n
"
" conv2d_bwd_weight: Backward Weight Convolution 2d
\n
"
);
// clang-format on
}
...
...
Prev
1
…
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment