Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
bb1f8082
Commit
bb1f8082
authored
May 26, 2022
by
root
Browse files
Merge remote-tracking branch 'origin/develop' into myamlak/cgemm
parents
97ac5007
82d7d993
Changes
177
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
315 additions
and
550 deletions
+315
-550
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
...e_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
+0
-24
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
...ce_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
+0
-40
profiler/CMakeLists.txt
profiler/CMakeLists.txt
+1
-0
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+107
-16
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+120
-308
profiler/src/profile_batched_gemm.cpp
profiler/src/profile_batched_gemm.cpp
+1
-1
profiler/src/profile_batched_gemm_reduce.cpp
profiler/src/profile_batched_gemm_reduce.cpp
+1
-1
profiler/src/profile_conv_bwd_weight.cpp
profiler/src/profile_conv_bwd_weight.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu.cpp
profiler/src/profile_conv_fwd_bias_relu.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu_add.cpp
profiler/src/profile_conv_fwd_bias_relu_add.cpp
+1
-1
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
+1
-1
profiler/src/profile_convnd_fwd.cpp
profiler/src/profile_convnd_fwd.cpp
+1
-1
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+1
-1
profiler/src/profile_gemm_bias_2d.cpp
profiler/src/profile_gemm_bias_2d.cpp
+1
-1
profiler/src/profile_gemm_bias_relu.cpp
profiler/src/profile_gemm_bias_relu.cpp
+1
-1
profiler/src/profile_gemm_bias_relu_add.cpp
profiler/src/profile_gemm_bias_relu_add.cpp
+1
-1
profiler/src/profile_gemm_reduce.cpp
profiler/src/profile_gemm_reduce.cpp
+1
-1
profiler/src/profile_grouped_gemm.cpp
profiler/src/profile_grouped_gemm.cpp
+1
-1
profiler/src/profile_reduce.cpp
profiler/src/profile_reduce.cpp
+71
-147
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+3
-2
No files found.
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.cpp
deleted
100644 → 0
View file @
97ac5007
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// Multiblock partial-reduce instances for int8_t input, int32_t accumulation and int8_t output.
// Each line passes one parameter combination to ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID;
// the argument order follows the column header below.

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.cpp
deleted
100644 → 0
View file @
97ac5007
#include "device_reduce_instance_multiblock_partial_reduce.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {

// Multiblock partial-reduce instances where input, accumulation and output are all int8_t.
// Each line passes one parameter combination to ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID;
// the argument order follows the column header below. The first twelve lines use
// IndicesOpt = 0, the last twelve repeat the same operations with IndicesOpt = 1.

// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on

} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
profiler/CMakeLists.txt
View file @
bb1f8082
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include/ck
${
PROJECT_SOURCE_DIR
}
/include/ck/utility
${
PROJECT_SOURCE_DIR
}
/include/ck/host_utility
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor_description
${
PROJECT_SOURCE_DIR
}
/include/ck/tensor
${
PROJECT_SOURCE_DIR
}
/include/ck/problem_transform
...
...
profiler/include/profile_gemm_impl.hpp
View file @
bb1f8082
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "check_err.hpp"
#include "config.hpp"
...
...
@@ -42,14 +44,10 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<De
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
...
...
@@ -74,6 +72,21 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector<Devic
void
add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
void
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances
(
std
::
vector
<
DeviceGemmNoOpPtr
>&
);
}
// namespace device_gemm_instance
}
// namespace device
}
// namespace tensor_operation
...
...
@@ -125,7 +138,11 @@ void profile_gemm_impl(int do_verification,
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
{
case
0
:
break
;
// case 0: break;
case
0
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{},
num_thread
);
break
;
case
1
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
},
num_thread
);
...
...
@@ -174,6 +191,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -192,6 +212,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -210,6 +233,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -228,6 +254,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -250,6 +279,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -268,6 +300,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances
(
gemm_ptrs
);
...
...
@@ -289,6 +324,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -307,6 +345,9 @@ void profile_gemm_impl(int do_verification,
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances
(
gemm_ptrs
);
}
...
...
@@ -353,28 +394,40 @@ void profile_gemm_impl(int do_verification,
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances
(
gemm_ptrs
);
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
&&
is_same
<
CLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances
(
gemm_ptrs
);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances
(
gemm_ptrs
);
ck
::
tensor_operation
::
device
::
device_gemm_instance
::
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances
(
gemm_ptrs
);
}
}
...
...
@@ -523,12 +576,50 @@ void profile_gemm_impl(int do_verification,
}
else
{
std
::
cout
<<
"does not support this GEMM problem"
<<
std
::
endl
;
std
::
cout
<<
gemm_ptr
->
GetTypeString
()
<<
" does not support this GEMM problem"
<<
std
::
endl
;
}
}
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
if
constexpr
(
is_same
<
CDataType
,
float
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f32"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
half_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = f16"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
bhalf_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = bf16"
;
}
else
if
constexpr
(
is_same
<
CDataType
,
int8_t
>::
value
)
{
std
::
cout
<<
"Best Perf for datatype = int8"
;
}
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
std
::
cout
<<
" ALayout = RowMajor"
;
}
else
if
constexpr
(
is_same
<
ALayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
)
{
std
::
cout
<<
" ALayout = ColumnMajor"
;
}
if
constexpr
(
is_same
<
BLayout
,
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
std
::
cout
<<
" BLayout = RowMajor"
;
}
else
if
constexpr
(
is_same
<
BLayout
,
tensor_layout
::
gemm
::
ColumnMajor
>::
value
)
{
std
::
cout
<<
" BLayout = ColumnMajor"
;
}
std
::
cout
<<
" M = "
<<
M
<<
" N = "
<<
N
<<
" K = "
<<
K
<<
" StrideA = "
<<
StrideA
<<
" StrideB = "
<<
StrideB
<<
" StrideC = "
<<
StrideC
<<
" : "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
}
}
// namespace profiler
...
...
profiler/include/profile_reduce_impl.hpp
View file @
bb1f8082
...
...
@@ -5,74 +5,77 @@
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_reduction.hpp"
#include "host_common_util.hpp"
#include "host_tensor_generator.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_reduce_instance
{
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
int
NanOpt
,
int
IndicesOpt
>
template
<
int
Rank
,
int
NumReduceDim
,
int
ReduceOpId
,
bool
PropagateNan
,
bool
UseIndex
>
struct
ReduceDescription
{
static
constexpr
int
Rank_
=
Rank
;
static
constexpr
int
NumReduceDim_
=
NumReduceDim
;
static
constexpr
int
ReduceOpId_
=
ReduceOpId
;
static
constexpr
int
NanOpt_
=
NanOpt
;
static
constexpr
int
IndicesOpt_
=
IndicesOpt
;
static
constexpr
int
PropagateNan_
=
PropagateNan
;
static
constexpr
int
UseIndex_
=
UseIndex
;
};
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
0
,
0
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
0
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
5
,
0
,
0
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
5
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
7
,
0
,
0
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
7
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
0
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
0
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
0
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
0
>
,
ReduceDescription
<
4
,
3
,
2
,
0
,
1
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
2
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
3
,
0
,
1
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
3
,
0
,
1
>
,
ReduceDescription
<
4
,
3
,
4
,
0
,
1
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
0
,
1
>
,
ReduceDescription
<
4
,
1
,
4
,
0
,
1
>
,
ReduceDescription
<
2
,
1
,
4
,
0
,
1
>>
;
using
reduce_description_instances
=
std
::
tuple
<
ReduceDescription
<
4
,
3
,
0
,
false
,
false
>
,
// for ADD
ReduceDescription
<
4
,
4
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
0
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
5
,
false
,
false
>
,
// for AVG
ReduceDescription
<
4
,
4
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
5
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
7
,
false
,
false
>
,
// for NORM2
ReduceDescription
<
4
,
4
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
7
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
false
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
false
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
false
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
false
>
,
ReduceDescription
<
4
,
3
,
2
,
false
,
true
>
,
// for MIN
ReduceDescription
<
4
,
4
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
2
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
3
,
false
,
true
>
,
// for MAX
ReduceDescription
<
4
,
4
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
3
,
false
,
true
>
,
ReduceDescription
<
4
,
3
,
4
,
false
,
true
>
,
// for AMAX
ReduceDescription
<
4
,
4
,
4
,
false
,
true
>
,
ReduceDescription
<
4
,
1
,
4
,
false
,
true
>
,
ReduceDescription
<
2
,
1
,
4
,
false
,
true
>>
;
template
<
typename
DescriptionType
>
bool
description_match
(
const
DescriptionType
&
description
,
int
Rank
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
)
bool
Propagat
eNan
,
bool
UseIndex
)
{
if
(
description
.
Rank_
!=
Rank
||
description
.
ReduceOpId_
!=
static_cast
<
int
>
(
ReduceOpId
)
||
description
.
Nan
Opt
_
!=
static_cast
<
int
>
(
Nan
Opt
)
||
description
.
IndicesOpt
_
!=
static_cast
<
int
>
(
IndicesOpt
))
description
.
Propagate
Nan_
!=
static_cast
<
int
>
(
Propagate
Nan
)
||
description
.
UseIndex
_
!=
static_cast
<
int
>
(
UseIndex
))
return
(
false
);
if
(
DescriptionType
::
NumReduceDim_
!=
reduceDims
.
size
())
...
...
@@ -116,46 +119,16 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return
invariantDims
;
};
/// Writes `dataNumItems` elements of type T, starting at `data`, to the file
/// `fileName` as raw bytes (binary mode), and reports the outcome on std::cout.
///
/// @param fileName     path of the file to create or overwrite
/// @param data         pointer to the first element to write
/// @param dataNumItems number of T elements (not bytes) to write
template <typename T>
static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);

    if(!outFile)
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
        return;
    }

    // View the buffer as const bytes so that const-qualified buffers are accepted too.
    outFile.write(reinterpret_cast<const char*>(data),
                  static_cast<std::streamsize>(dataNumItems * sizeof(T)));
    outFile.close();

    // write()/close() set the stream's fail state on I/O errors (e.g. disk full);
    // only announce success when the data actually reached the file.
    if(outFile.fail())
    {
        std::cout << "Failed to write output to file " << fileName << std::endl;
    }
    else
    {
        std::cout << "Write output to file " << fileName << std::endl;
    }
}
// Maps a data type used by the GPU kernels to the corresponding type used by the host code.
// This primary template is the identity mapping: OutType is the same type as the argument.
template <typename KernelType>
struct type_mapping
{
    using OutType = KernelType;
};
// Specialization for ck::half_t: OutType is half_float::half instead of the input type.
template <>
struct type_mapping<ck::half_t>
{
    using OutType = half_float::half;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
int
Rank
,
int
NumReduceDim
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
>
void
profile_reduce_impl_impl
(
bool
do_verification
,
bool
Propagat
eNan
,
bool
UseIndex
>
bool
profile_reduce_impl_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
...
...
@@ -166,15 +139,13 @@ void profile_reduce_impl_impl(bool do_verification,
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
::
device_reduce_instance
;
using
namespace
ck
::
host_reduce
;
using
ck
::
host_common
::
dumpBufferToFile
;
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
constexpr
bool
NeedIndices
=
(
op_support_indices
&&
(
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
));
constexpr
bool
PropagateNan
=
(
NanOpt
==
NanPropagation
::
PROPAGATE_NAN
);
constexpr
bool
OutputIndex
=
(
op_support_indices
&&
UseIndex
);
constexpr
bool
out_support_atomic_add
=
std
::
is_same
<
OutDataType
,
float
>::
value
;
constexpr
bool
op_support_atomic_add
=
...
...
@@ -195,8 +166,7 @@ void profile_reduce_impl_impl(bool do_verification,
(
op_support_indices
&&
!
std
::
is_same
<
AccDataType
,
float
>::
value
);
// 1) The indices can only be used when the reduction operation is indexable
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
IndicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
);
constexpr
bool
invalid_reduce_3
=
(
!
op_support_indices
&&
UseIndex
);
// 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
// 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
...
...
@@ -219,6 +189,8 @@ void profile_reduce_impl_impl(bool do_verification,
constexpr
bool
invalid_reduce
=
(
invalid_reduce_1
||
invalid_reduce_2
||
invalid_reduce_3
||
invalid_reduce_4
||
invalid_reduce_5
||
invalid_reduce_6
);
bool
pass
=
true
;
if
constexpr
(
!
invalid_reduce
)
{
Tensor
<
InDataType
>
in
(
inLengths
);
...
...
@@ -282,7 +254,7 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
size_t
indicesSizeInBytes
=
NeedIndices
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
size_t
indicesSizeInBytes
=
OutputIndex
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int
)
:
0
;
DeviceMem
out_indices_dev
(
indicesSizeInBytes
);
...
...
@@ -295,29 +267,11 @@ void profile_reduce_impl_impl(bool do_verification,
using
AccElementwiseOperation_0
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
using
InElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
InElementwiseOperation
;
using
AccElementwiseOperation_1
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
false
>::
AccElementwiseOperation
;
using
InElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation_2
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
false
,
true
>::
AccElementwiseOperation
;
using
DeviceReduceInstPtr0
=
DeviceReducePtr
<
InElementwiseOperation_0
,
AccElementwiseOperation_0
>
;
using
DeviceReduceInstPtr1
=
DeviceReducePtr
<
InElementwiseOperation_1
,
AccElementwiseOperation_1
>
;
using
DeviceReduceInstPtr2
=
DeviceReducePtr
<
InElementwiseOperation_2
,
AccElementwiseOperation_2
>
;
std
::
vector
<
DeviceReduceInstPtr0
>
reduce0_ptrs
;
std
::
vector
<
DeviceReduceInstPtr1
>
reduce1_ptrs
;
std
::
vector
<
DeviceReduceInstPtr2
>
reduce2_ptrs
;
add_device_reduce_instance_threadwise
<
InDataType
,
AccDataType
,
...
...
@@ -325,8 +279,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
add_device_reduce_instance_blockwise
<
InDataType
,
AccDataType
,
...
...
@@ -334,8 +288,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
if
constexpr
(
use_atomic_add
)
{
...
...
@@ -345,35 +299,11 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
ReduceOpId
,
Nan
Opt
,
IndicesOpt
>
(
reduce0_ptrs
);
Propagate
Nan
,
UseIndex
>
(
reduce0_ptrs
);
}
else
{
add_device_reduce_instance_multiblock_partial_reduce
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce1_ptrs
);
};
// used for secondary reduction
if
constexpr
(
!
use_atomic_add
)
{
add_device_reduce_instance_blockwise_second_call
<
AccDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
ReduceOpId
,
NanOpt
,
IndicesOpt
>
(
reduce2_ptrs
);
};
if
(
reduce0_ptrs
.
empty
()
&&
reduce1_ptrs
.
empty
())
if
(
reduce0_ptrs
.
empty
())
{
throw
std
::
runtime_error
(
"Wrong! No device REDUCE instance found"
);
};
...
...
@@ -387,23 +317,25 @@ void profile_reduce_impl_impl(bool do_verification,
Rank
,
NumReduceDim
,
PropagateNan
,
NeedIndices
>
OutputIndex
>
hostReduce
(
in
.
mDesc
,
out_ref
.
mDesc
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
beta
,
out_ref
.
mData
.
data
(),
out_indices_ref
.
mData
.
data
());
};
const
auto
i_inLengths
=
to_int_vector
(
inLengths
);
const
auto
i_inStrides
=
to_int_vector
(
inStrides
);
const
auto
i_outLengths
=
to_int_vector
(
outLengths
);
const
auto
i_outStrides
=
to_int_vector
(
outStrides
);
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
for
(
auto
&
reduce_ptr
:
reduce0_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_0
in_elementwise_op_0
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_0
acc_elementwise_op_0
(
...
...
@@ -417,9 +349,9 @@ void profile_reduce_impl_impl(bool do_verification,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
nullptr
,
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_0
,
acc_elementwise_op_0
);
...
...
@@ -439,8 +371,9 @@ void profile_reduce_impl_impl(bool do_verification,
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
...
...
@@ -450,22 +383,24 @@ void profile_reduce_impl_impl(bool do_verification,
if
(
do_verification
)
{
bool
single_pass
;
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
single_pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
if
(
OutputIndex
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
single_pass
=
single_pass
&&
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
)
;
};
if
(
do_log
)
if
(
!
single_pass
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
;
std
::
cout
<<
"Fail Info: "
<<
reduce_ptr
->
GetTypeString
()
<<
std
::
endl
;
}
pass
=
pass
&&
single_pass
;
};
if
(
do_dumpout
)
...
...
@@ -474,7 +409,7 @@ void profile_reduce_impl_impl(bool do_verification,
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
if
(
OutputIndex
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
...
...
@@ -486,158 +421,34 @@ void profile_reduce_impl_impl(bool do_verification,
};
};
for
(
auto
&
reduce_ptr
:
reduce1_ptrs
)
{
auto
wsSizeInBytes
=
reduce_ptr
->
GetWorkspaceSizeInBytes
(
i_inLengths
,
reduceDims
);
DeviceMem
ws_dev
(
wsSizeInBytes
);
InElementwiseOperation_1
in_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_1
acc_elementwise_op_1
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument_ptr
=
reduce_ptr
->
MakeArgumentPointer
(
i_inLengths
,
i_inStrides
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
in_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_1
,
acc_elementwise_op_1
);
if
(
!
reduce_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
continue
;
std
::
string
reduce_name
=
reduce_ptr
->
GetTypeString
();
auto
invoker_ptr
=
reduce_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
std
::
vector
<
int
>
inLengths2
=
reduce_ptr
->
GetWorkspace2dLengths
(
argument_ptr
.
get
());
std
::
vector
<
int
>
inStrides2
{
inLengths2
[
1
],
1
};
for
(
auto
&
reduce2_ptr
:
reduce2_ptrs
)
{
InElementwiseOperation_2
in_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
AccElementwiseOperation_2
acc_elementwise_op_2
(
static_cast
<
int32_t
>
(
reduce_total_length
));
auto
argument2_ptr
=
reduce2_ptr
->
MakeArgumentPointer
(
inLengths2
,
inStrides2
,
i_outLengths
,
i_outStrides
,
reduceDims
,
alpha
,
beta
,
ws_dev
.
GetDeviceBuffer
(),
out_dev
.
GetDeviceBuffer
(),
out_indices_dev
.
GetDeviceBuffer
(),
ws_dev
.
GetDeviceBuffer
(),
in_elementwise_op_2
,
acc_elementwise_op_2
);
if
(
!
reduce2_ptr
->
IsSupportedArgument
(
argument2_ptr
.
get
()))
continue
;
std
::
string
reduce2_name
=
reduce2_ptr
->
GetTypeString
();
auto
invoker2_ptr
=
reduce2_ptr
->
MakeInvokerPointer
();
float
avg_time_2
=
invoker2_ptr
->
Run
(
argument2_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes_2
=
static_cast
<
size_t
>
(
inLengths2
[
0
])
*
inLengths2
[
1
]
*
sizeof
(
AccDataType
);
float
gb_per_sec
=
(
num_bytes
+
num_bytes_2
)
/
1.E6
/
(
avg_time
+
avg_time_2
);
std
::
cout
<<
"Perf: "
<<
(
avg_time
+
avg_time_2
)
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
" => "
<<
reduce2_name
<<
std
::
endl
;
if
(
gb_per_sec
>
best_gb_per_sec
)
{
best_avg_time
=
avg_time
+
avg_time_2
;
best_gb_per_sec
=
gb_per_sec
;
}
if
(
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
};
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_ref
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
}
if
(
do_dumpout
)
{
dumpBufferToFile
(
"dump_in.bin"
,
in
.
mData
.
data
(),
in
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out.bin"
,
out
.
mData
.
data
(),
out
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_out_host.bin"
,
out_ref
.
mData
.
data
(),
out_ref
.
mDesc
.
GetElementSize
());
if
(
NeedIndices
)
{
dumpBufferToFile
(
"dump_indices.bin"
,
out_indices
.
mData
.
data
(),
out_indices
.
mDesc
.
GetElementSize
());
dumpBufferToFile
(
"dump_indices_host.bin"
,
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
mDesc
.
GetElementSize
());
};
};
};
};
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Best Perf: "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"The requested reduction operation is not supported, please check !!!"
<<
std
::
endl
;
};
return
pass
;
};
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
>
void
profile_reduce_impl
(
bool
do_verification
,
bool
profile_reduce_impl
(
bool
do_verification
,
int
init_method
,
bool
do_log
,
bool
do_dumpout
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
ReduceTensorOp
ReduceOpId
,
Nan
Propagat
ion
NanOpt
,
ReduceTensorIndices
IndicesOpt
,
bool
Propagat
eNan
,
bool
UseIndex
,
float
alpha
,
float
beta
)
{
bool
matched
=
false
;
bool
pass
=
true
;
using
tuple_of_description_instances
=
tensor_operation
::
device
::
device_reduce_instance
::
reduce_description_instances
;
...
...
@@ -651,29 +462,30 @@ void profile_reduce_impl(bool do_verification,
using
descType
=
remove_cvref_t
<
decltype
(
std
::
get
<
i
>
(
tuple_object
))
>
;
if
(
!
description_match
(
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
NanOpt
,
IndicesOpt
))
descType
{},
inLengths
.
size
(),
reduceDims
,
ReduceOpId
,
PropagateNan
,
UseIndex
))
return
;
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
NanPropagation
>
(
descType
::
NanOpt_
),
static_cast
<
ReduceTensorIndices
>
(
descType
::
IndicesOpt_
)
>
(
do_verification
,
init_method
,
do_log
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
pass
=
pass
&&
profile_reduce_impl_impl
<
InDataType
,
AccDataType
,
OutDataType
,
descType
::
Rank_
,
descType
::
NumReduceDim_
,
static_cast
<
ReduceTensorOp
>
(
descType
::
ReduceOpId_
),
static_cast
<
bool
>
(
descType
::
PropagateNan_
),
static_cast
<
bool
>
(
descType
::
UseIndex_
)
>
(
do_verification
,
init_method
,
do_dumpout
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
matched
=
true
;
});
return
pass
;
};
}
// namespace profiler
...
...
profiler/src/profile_batched_gemm.cpp
View file @
bb1f8082
...
...
@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_batched_gemm_reduce.cpp
View file @
bb1f8082
...
...
@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_bwd_weight.cpp
View file @
bb1f8082
...
...
@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this Conv data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu.cpp
View file @
bb1f8082
...
...
@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_add.cpp
View file @
bb1f8082
...
...
@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp
View file @
bb1f8082
...
...
@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! data_type & layout for this operator is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_convnd_fwd.cpp
View file @
bb1f8082
...
...
@@ -347,5 +347,5 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
std
::
to_string
(
num_dim_spatial
));
}
return
1
;
return
0
;
}
profiler/src/profile_gemm.cpp
View file @
bb1f8082
...
...
@@ -388,5 +388,5 @@ int profile_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_2d.cpp
View file @
bb1f8082
...
...
@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu.cpp
View file @
bb1f8082
...
...
@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_bias_relu_add.cpp
View file @
bb1f8082
...
...
@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_gemm_reduce.cpp
View file @
bb1f8082
...
...
@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_grouped_gemm.cpp
View file @
bb1f8082
...
...
@@ -153,5 +153,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw
std
::
runtime_error
(
"wrong! this GEMM data_type & layout is not implemented"
);
}
return
1
;
return
0
;
}
profiler/src/profile_reduce.cpp
View file @
bb1f8082
#include <iostream>
#include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <getopt.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "data_type_enum.hpp"
#include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp"
using
namespace
std
;
using
ck
::
NanPropagation
;
using
ck
::
ReduceTensorIndices
;
using
ck
::
ReduceTensorOp
;
static
struct
option
long_options
[]
=
{{
"inLengths"
,
required_argument
,
nullptr
,
'D'
},
...
...
@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{
"bf16"
,
no_argument
,
nullptr
,
'?'
},
{
"dumpout"
,
required_argument
,
nullptr
,
'o'
},
{
"verify"
,
required_argument
,
nullptr
,
'v'
},
{
"log"
,
required_argument
,
nullptr
,
'l'
},
{
"help"
,
no_argument
,
nullptr
,
'?'
},
{
nullptr
,
0
,
nullptr
,
0
}};
template
<
typename
T
>
static
T
getSingleValueFromString
(
const
string
&
valueStr
)
{
std
::
istringstream
iss
(
valueStr
);
T
val
;
iss
>>
val
;
return
(
val
);
};
template
<
typename
T
>
static
std
::
vector
<
T
>
getTypeValuesFromString
(
const
char
*
cstr_values
)
{
std
::
string
valuesStr
(
cstr_values
);
std
::
vector
<
T
>
values
;
std
::
size_t
pos
=
0
;
std
::
size_t
new_pos
;
new_pos
=
valuesStr
.
find
(
','
,
pos
);
while
(
new_pos
!=
std
::
string
::
npos
)
{
const
std
::
string
sliceStr
=
valuesStr
.
substr
(
pos
,
new_pos
-
pos
);
T
val
=
getSingleValueFromString
<
T
>
(
sliceStr
);
values
.
push_back
(
val
);
pos
=
new_pos
+
1
;
new_pos
=
valuesStr
.
find
(
','
,
pos
);
};
std
::
string
sliceStr
=
valuesStr
.
substr
(
pos
);
T
val
=
getSingleValueFromString
<
T
>
(
sliceStr
);
values
.
push_back
(
val
);
return
(
values
);
}
enum
struct
AppDataType
{
appHalf
=
0
,
appFloat
=
1
,
appInt32
=
2
,
appInt8
=
3
,
appInt8x4
=
4
,
appBFloat16
=
5
,
appDouble
=
6
,
};
static
void
check_reduce_dims
(
const
int
rank
,
const
std
::
vector
<
int
>&
reduceDims
)
{
for
(
auto
dim
:
reduceDims
)
...
...
@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
};
};
class
App
Args
class
ReduceProfiler
Args
{
private:
int
option_index
=
0
;
...
...
@@ -130,26 +68,23 @@ class AppArgs
std
::
vector
<
float
>
scales
;
ReduceTensorOp
reduceOp
=
ReduceTensorOp
::
ADD
;
App
DataType
compTypeId
=
App
DataType
::
app
Float
;
App
DataType
outTypeId
=
App
DataType
::
app
Float
;
ReduceTensorOp
reduceOp
=
ReduceTensorOp
::
ADD
;
ck
::
DataType
Enum
compTypeId
=
ck
::
DataType
Enum
::
Float
;
ck
::
DataType
Enum
outTypeId
=
ck
::
DataType
Enum
::
Float
;
bool
compType_assigned
=
false
;
bool
outType_assigned
=
false
;
NanPropagation
nanOpt
=
NanPropagation
::
NOT_PROPAGATE_NAN
;
ReduceTensorIndices
indicesOpt
=
ReduceTensorIndices
::
NO_INDICES
;
bool
do_log
=
false
;
bool
do_verification
=
false
;
bool
do_dumpout
=
false
;
int
nanOpt
=
0
;
int
indicesOpt
=
0
;
bool
do_verification
=
false
;
bool
do_dumpout
=
false
;
int
init_method
;
bool
time_kernel
;
bool
need_indices
=
false
;
AppArgs
()
=
default
;
~
AppArgs
()
=
default
;
ReduceProfilerArgs
()
=
default
;
~
ReduceProfilerArgs
()
=
default
;
void
show_usage
(
const
char
*
cmd
)
{
...
...
@@ -166,8 +101,11 @@ class AppArgs
std
::
cout
<<
"--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half"
<<
std
::
endl
;
std
::
cout
<<
"--nanOpt or -N, enum value indicates the selection for NanOpt"
<<
std
::
endl
;
std
::
cout
<<
"--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
std
::
cout
<<
"--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<<
std
::
endl
;
std
::
cout
<<
"--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<<
std
::
endl
;
std
::
cout
<<
"--scales or -S, comma separated two float values for alpha and beta"
<<
std
::
endl
;
...
...
@@ -181,18 +119,19 @@ class AppArgs
std
::
cout
<<
"--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis"
<<
std
::
endl
;
std
::
cout
<<
"--log or -l, 1/0 to indicate whether to log some information"
<<
std
::
endl
;
};
int
processArgs
(
int
argc
,
char
*
argv
[])
{
using
ck
::
host_common
::
getTypeValuesFromString
;
int
ch
;
optind
++
;
// to skip the "reduce" module name
while
(
1
)
{
ch
=
getopt_long
(
argc
,
argv
,
"D:R:O:C:W:N:I:S:v:o:
l:
"
,
long_options
,
&
option_index
);
ch
=
getopt_long
(
argc
,
argv
,
"D:R:O:C:W:N:I:S:v:o:"
,
long_options
,
&
option_index
);
if
(
ch
==
-
1
)
break
;
switch
(
ch
)
...
...
@@ -219,27 +158,27 @@ class AppArgs
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
compTypeId
=
static_cast
<
App
DataType
>
(
std
::
atoi
(
optarg
));
compTypeId
=
static_cast
<
ck
::
DataType
Enum
>
(
std
::
atoi
(
optarg
));
compType_assigned
=
true
;
break
;
case
'W'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
outTypeId
=
static_cast
<
App
DataType
>
(
std
::
atoi
(
optarg
));
outTypeId
=
static_cast
<
ck
::
DataType
Enum
>
(
std
::
atoi
(
optarg
));
outType_assigned
=
true
;
break
;
case
'N'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
nanOpt
=
static_cast
<
NanPropagation
>
(
std
::
atoi
(
optarg
)
)
;
nanOpt
=
std
::
atoi
(
optarg
);
break
;
case
'I'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
indicesOpt
=
static_cast
<
ReduceTensorIndices
>
(
std
::
atoi
(
optarg
)
)
;
indicesOpt
=
std
::
atoi
(
optarg
);
break
;
case
'S'
:
if
(
!
optarg
)
...
...
@@ -262,12 +201,6 @@ class AppArgs
do_dumpout
=
static_cast
<
bool
>
(
std
::
atoi
(
optarg
));
break
;
case
'l'
:
if
(
!
optarg
)
throw
std
::
runtime_error
(
"Invalid option format!"
);
do_log
=
static_cast
<
bool
>
(
std
::
atoi
(
optarg
));
break
;
case
'?'
:
if
(
std
::
string
(
long_options
[
option_index
].
name
)
==
"half"
)
use_half
=
true
;
...
...
@@ -295,7 +228,7 @@ class AppArgs
throw
std
::
runtime_error
(
"Invalid cmd-line arguments, more argumetns are needed!"
);
init_method
=
std
::
atoi
(
argv
[
optind
++
]);
time_kernel
=
std
::
atoi
(
argv
[
optind
]);
time_kernel
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
])
)
;
if
(
scales
.
empty
())
{
...
...
@@ -306,9 +239,6 @@ class AppArgs
if
(
reduceOp
==
ReduceTensorOp
::
MIN
||
reduceOp
==
ReduceTensorOp
::
MAX
||
reduceOp
==
ReduceTensorOp
::
AMAX
)
{
if
(
indicesOpt
!=
ReduceTensorIndices
::
NO_INDICES
)
need_indices
=
true
;
// for indexable operations, no need to assign compType and outType, just let them be
// same as inType
compType_assigned
=
false
;
...
...
@@ -322,9 +252,10 @@ class AppArgs
int
profile_reduce
(
int
argc
,
char
*
argv
[])
{
using
namespace
ck
::
profiler
;
using
ck
::
DataTypeEnum
;
using
ck
::
profiler
::
profile_reduce_impl
;
App
Args
args
;
ReduceProfiler
Args
args
;
if
(
args
.
processArgs
(
argc
,
argv
)
<
0
)
return
(
-
1
);
...
...
@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if
(
args
.
use_half
)
{
if
(
!
args
.
compType_assigned
)
args
.
compTypeId
=
App
DataType
::
app
Half
;
args
.
compTypeId
=
DataType
Enum
::
Half
;
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
Half
&&
args
.
outTypeId
!=
App
DataType
::
app
Float
))
args
.
outTypeId
=
App
DataType
::
app
Float
;
(
args
.
outTypeId
!=
DataType
Enum
::
Half
&&
args
.
outTypeId
!=
DataType
Enum
::
Float
))
args
.
outTypeId
=
DataType
Enum
::
Float
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
Half
;
args
.
outTypeId
=
DataType
Enum
::
Half
;
if
(
args
.
compTypeId
==
App
DataType
::
app
Half
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Half
)
{
profile_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
profile_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Float
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Float
)
{
profile_reduce_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{
profile_reduce_impl
<
double
,
double
,
double
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
use_int8
)
{
if
(
!
args
.
compType_assigned
)
args
.
compTypeId
=
App
DataType
::
app
Int8
;
args
.
compTypeId
=
DataType
Enum
::
Int8
;
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
Int8
&&
args
.
outTypeId
!=
App
DataType
::
app
Int32
))
args
.
outTypeId
=
App
DataType
::
app
Int32
;
(
args
.
outTypeId
!=
DataType
Enum
::
Int8
&&
args
.
outTypeId
!=
DataType
Enum
::
Int32
))
args
.
outTypeId
=
DataType
Enum
::
Int32
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
Int8
;
args
.
outTypeId
=
DataType
Enum
::
Int8
;
if
(
args
.
compTypeId
==
App
DataType
::
app
Int8
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Int8
)
{
profile_reduce_impl
<
int8_t
,
int8_t
,
int8_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Int32
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Int32
)
{
profile_reduce_impl
<
int8_t
,
int32_t
,
int8_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else
if
(
args
.
use_bf16
)
{
if
(
args
.
outType_assigned
&&
(
args
.
outTypeId
!=
App
DataType
::
app
BFloat16
&&
args
.
outTypeId
!=
App
DataType
::
app
Float
))
args
.
outTypeId
=
App
DataType
::
app
Float
;
(
args
.
outTypeId
!=
DataType
Enum
::
BFloat16
&&
args
.
outTypeId
!=
DataType
Enum
::
Float
))
args
.
outTypeId
=
DataType
Enum
::
Float
;
if
(
!
args
.
outType_assigned
)
args
.
outTypeId
=
App
DataType
::
app
BFloat16
;
args
.
outTypeId
=
DataType
Enum
::
BFloat16
;
profile_reduce_impl
<
ck
::
bhalf_t
,
float
,
ck
::
bhalf_t
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
{
if
(
args
.
compTypeId
==
App
DataType
::
app
Float
)
if
(
args
.
compTypeId
==
DataType
Enum
::
Float
)
{
profile_reduce_impl
<
float
,
float
,
float
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
else
if
(
args
.
compTypeId
==
App
DataType
::
app
Double
)
else
if
(
args
.
compTypeId
==
DataType
Enum
::
Double
)
{
profile_reduce_impl
<
float
,
double
,
float
>
(
args
.
do_verification
,
args
.
init_method
,
args
.
do_log
,
args
.
do_dumpout
,
args
.
time_kernel
,
args
.
inLengths
,
args
.
reduceDims
,
args
.
reduceOp
,
args
.
nanOpt
,
args
.
indicesOpt
,
static_cast
<
bool
>
(
args
.
nanOpt
)
,
static_cast
<
bool
>
(
args
.
indicesOpt
)
,
args
.
scales
[
0
],
args
.
scales
[
1
]);
}
...
...
profiler/src/profiler.cpp
View file @
bb1f8082
...
...
@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
int
profile_batched_gemm
(
int
,
char
*
[]);
int
profile_grouped_gemm
(
int
,
char
*
[]);
int
profile_conv_fwd
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_add
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_atomic_add
(
int
,
char
*
[]);
...
...
@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
}
else
if
(
strcmp
(
argv
[
1
],
"grouped_gemm"
)
==
0
)
{
profile_grouped_gemm
(
argc
,
argv
);
return
profile_grouped_gemm
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"conv_fwd"
)
==
0
)
{
...
...
@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim
\n
"
" conv2d_bwd_data: BackwardConvolution data 2 dim
\n
"
" conv3d_bwd_data: BackwardConvolution data 3 dim
\n
"
" reduce: R
EDUCE
\n
"
" reduce: R
educe
\n
"
" conv2d_bwd_weight: Backward Weight Convolution 2d
\n
"
);
// clang-format on
}
...
...
Prev
1
…
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment