Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
05fd7ff8
Commit
05fd7ff8
authored
Jan 30, 2024
by
Jakub Piasecki
Browse files
Merge remote-tracking branch 'origin/develop' into gemm_f16_int8
parents
2784b516
84832fc4
Changes
81
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1099 additions
and
114 deletions
+1099
-114
library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
.../device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
+73
-8
library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
.../device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
+82
-0
library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp
...a_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp
+1
-1
library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp
...a_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp
+1
-1
profiler/include/profiler/profile_gemm_impl.hpp
profiler/include/profiler/profile_gemm_impl.hpp
+1
-1
profiler/include/profiler/profile_gemm_splitk_impl.hpp
profiler/include/profiler/profile_gemm_splitk_impl.hpp
+1
-1
profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp
...nclude/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp
+261
-0
profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
...nclude/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
+263
-0
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+3
-0
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+9
-3
profiler/src/profile_groupnorm_bwd_gamma_beta.cpp
profiler/src/profile_groupnorm_bwd_gamma_beta.cpp
+104
-0
profiler/src/profile_layernorm_bwd_gamma_beta.cpp
profiler/src/profile_layernorm_bwd_gamma_beta.cpp
+112
-0
script/clang-format-overwrite.sh
script/clang-format-overwrite.sh
+1
-1
test/CMakeLists.txt
test/CMakeLists.txt
+1
-0
test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
...tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
+2
-0
test/normalization_bwd_gamma_beta/CMakeLists.txt
test/normalization_bwd_gamma_beta/CMakeLists.txt
+13
-0
test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp
...ion_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp
+51
-0
test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
...n_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
+48
-0
test/wrapper/test_copy.cpp
test/wrapper/test_copy.cpp
+40
-39
test/wrapper/test_partition.cpp
test/wrapper/test_partition.cpp
+32
-59
No files found.
library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp
View file @
05fd7ff8
...
...
@@ -27,6 +27,7 @@ using S = ck::Sequence<Is...>;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
GemmKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
KPadding
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
static
constexpr
auto
GemmMNKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
...
...
@@ -110,17 +111,39 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple<
// clang-format on
>
;
template
<
ck
::
tensor_operation
::
device
::
GemmSpecialization
GemmSpec
>
template
<
ck
::
tensor_operation
::
device
::
GemmSpecialization
GemmSpec
,
ck
::
PipelineVersion
PipVer
,
ck
::
LoopScheduler
LoopSche
>
using
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
=
std
::
tuple
<
// clang-format off
//#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
32
,
128
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipelineVersion
::
v1
,
LoopScheduler
::
Interwave
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
128
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipelineVersion
::
v1
,
LoopScheduler
::
Interwave
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipelineVersion
::
v1
,
LoopScheduler
::
Interwave
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipelineVersion
::
v1
,
LoopScheduler
::
Interwave
>
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
128
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
512
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
128
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
256
,
16
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
256
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
512
,
16
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
64
,
16
,
16
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
8
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
64
,
16
,
16
,
16
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
16
,
4
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
32
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
64
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
2
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
128
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
256
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
256
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
4
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
32
,
16
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
64
,
16
,
8
,
8
,
16
,
16
,
2
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
128
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
256
,
16
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
256
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
8
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
2
,
1
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
// clang-format on
>
;
...
...
@@ -141,9 +164,51 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances
<
GemmMNKPadding
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmMNKPadding
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp
View file @
05fd7ff8
...
...
@@ -27,6 +27,7 @@ using S = ck::Sequence<Is...>;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
GemmKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
KPadding
;
static
constexpr
auto
GemmMNPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNPadding
;
static
constexpr
auto
GemmMNKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
...
...
@@ -95,6 +96,41 @@ using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple<
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
64
,
32
,
64
,
4
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
1
,
3
,
2
>
,
S
<
0
,
1
,
3
,
2
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
8
,
F16
,
PipelineVersion
::
v2
>
// clang-format on
>
;
template
<
ck
::
tensor_operation
::
device
::
GemmSpecialization
GemmSpec
,
ck
::
PipelineVersion
PipVer
,
ck
::
LoopScheduler
LoopSche
>
using
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
=
std
::
tuple
<
// clang-format off
//#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
//#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl|
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
128
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
256
,
4
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
512
,
4
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
128
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
256
,
16
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
4
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
256
,
16
,
4
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
512
,
16
,
4
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
4
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
64
,
16
,
16
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
8
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
8
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
64
,
16
,
16
,
16
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
16
,
4
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
16
,
4
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
32
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
64
,
8
,
8
,
16
,
16
,
1
,
2
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
128
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
16
,
256
,
8
,
8
,
16
,
16
,
1
,
8
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
16
,
256
,
8
,
8
,
16
,
16
,
1
,
4
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
32
,
16
,
8
,
8
,
16
,
16
,
1
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
64
,
16
,
8
,
8
,
16
,
16
,
2
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
128
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
128
,
256
,
16
,
8
,
8
,
16
,
16
,
8
,
1
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
,
DeviceGemmXdlSplitKCShuffle
<
F16
,
F16
,
F16
,
F32
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
GemmSpec
,
256
,
256
,
16
,
8
,
8
,
16
,
16
,
4
,
1
,
S
<
1
,
8
,
32
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
S
<
1
,
8
,
16
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
64
,
1
,
4
>
,
4
,
F16
,
PipVer
,
LoopSche
>
// clang-format on
>
;
void
add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
...
...
@@ -112,6 +148,52 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances
<
GemmMNKPadding
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmDefault
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v2
,
ck
::
LoopScheduler
::
Default
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instances
<
GemmMNKPadding
,
ck
::
PipelineVersion
::
v1
,
ck
::
LoopScheduler
::
Interwave
>
{});
}
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f16_instance.cpp
View file @
05fd7ff8
...
...
@@ -8,7 +8,7 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
void
add_device_layernorm2d_bwd_gamma_beta_
rank_2_1_
f16_instances
(
void
add_device_layernorm2d_bwd_gamma_beta_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalizationBwdGammaBeta
<
F16
,
F16
,
F16
,
F16
,
F16
,
2
,
1
>>>&
instances
)
{
...
...
library/src/tensor_operation_instance/gpu/normalization_bwd_gamma_beta/device_layernorm2d_bwd_gamma_beta_f32_instance.cpp
View file @
05fd7ff8
...
...
@@ -8,7 +8,7 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
void
add_device_layernorm2d_bwd_gamma_beta_
rank_2_1_
f32_instances
(
void
add_device_layernorm2d_bwd_gamma_beta_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceNormalizationBwdGammaBeta
<
F32
,
F32
,
F32
,
F32
,
F32
,
2
,
1
>>>&
instances
)
{
...
...
profiler/include/profiler/profile_gemm_impl.hpp
View file @
05fd7ff8
...
...
@@ -298,7 +298,7 @@ int profile_gemm_impl(int do_verification,
}
}
return
pass
?
0
:
1
;
return
pass
;
}
}
// namespace profiler
...
...
profiler/include/profiler/profile_gemm_splitk_impl.hpp
View file @
05fd7ff8
...
...
@@ -145,7 +145,7 @@ bool profile_gemm_splitk_impl(int do_verification,
// profile device GEMM instances
for
(
auto
&
op_ptr
:
op_ptrs
)
{
std
::
vector
<
int
>
kbatch_list
=
{
1
,
2
,
4
,
8
,
12
,
16
,
20
,
32
,
3
6
,
40
,
64
,
96
,
12
8
};
std
::
vector
<
int
>
kbatch_list
=
{
1
,
2
,
4
,
8
,
12
,
16
,
19
,
20
,
32
,
38
};
if
(
KBatch
>
0
)
{
...
...
profiler/include/profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_gamma_beta.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"
namespace
ck
{
namespace
profiler
{
// Profiles every registered device instance of the groupnorm backward
// (dgamma/dbeta) operation on a rank-5 [N, H, W, G, C] input, optionally
// verifying each instance against the CPU reference implementation.
//
// Parameters:
//   do_verification - non-zero: compare device dgamma/dbeta to the CPU reference
//   init_method     - 0: constant init; 1: integer-valued init; otherwise decimal init
//   do_log          - dump tensor contents during verification
//   time_kernel     - time each kernel and print per-instance and best perf
//   length          - input extents; must have exactly 5 entries ([N, H, W, G, C])
//
// Returns true when at least one instance was applicable and every verified
// instance passed; false on bad input, verification failure, or when no
// kernel supports the argument.
template <typename DYDataType,
          typename XDataType,
          typename MeanInvStdDataType,
          typename ComputeDataType,
          typename DGammaDataType,
          typename DBetaDataType>
bool profile_groupnorm_bwd_gamma_beta_impl(int do_verification,
                                           int init_method,
                                           bool do_log,
                                           bool time_kernel,
                                           std::vector<index_t> length)
{
    // we don't need GammaDataType and DXDataType here, just for reference class
    using GammaDataType = DYDataType;
    using DXDataType    = DYDataType;

    if(length.size() != 5)
        return false;

    index_t N = length[0];
    index_t G = length[3];
    index_t C = length[4];

    // dgamma/dbeta reduce over N, H, W (dims 0, 1, 2), leaving a [G, C] result.
    std::vector<index_t> reduce_dim        = {0, 1, 2};
    std::vector<index_t> gamma_beta_length = {G, C};

    Tensor<DYDataType> dy(length);
    Tensor<XDataType> x(length);
    // NOTE(review): gamma is never filled by the init switch below; it is only
    // passed to the CPU reference, and its influence (host_dx) is not checked
    // here -- confirm this is intentional.
    Tensor<GammaDataType> gamma(gamma_beta_length); // dummy tensor, for reference
    Tensor<MeanInvStdDataType> mean({N, G});
    Tensor<MeanInvStdDataType> inv_std({N, G});
    Tensor<DGammaDataType> dgamma(gamma_beta_length);
    Tensor<DBetaDataType> dbeta(gamma_beta_length);
    Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
    Tensor<DGammaDataType> host_dgamma(gamma_beta_length);
    Tensor<DBetaDataType> host_dbeta(gamma_beta_length);

    std::vector<index_t> strideDy =
        std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
    std::vector<index_t> strideX =
        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
    std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
                                      dgamma.mDesc.GetStrides().end()};
    std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
                                     dbeta.mDesc.GetStrides().end()};
    // mean/inv_std are [N, G] broadcast over the rank-5 input: stride G on N,
    // stride 1 on G, zero elsewhere.
    std::vector<index_t> strideMeanInvStd = {G, 0, 0, 1, 0};

    switch(init_method)
    {
    case 0:
        // Constant init (GeneratorTensor_1).
        dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
        x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
        mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
        inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
        dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
        dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
        break;
    case 1:
        // Integer-valued init; inv_std is kept non-negative.
        dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
        x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
        mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
        inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
        dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
        dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
        break;
    default:
        // Decimal-valued init; inv_std is kept non-negative.
        dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
        x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
        mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
        inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
        dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
        dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
    }

    DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
    DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
    DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
    DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());

    dy_dev.ToDevice(dy.mData.data());
    x_dev.ToDevice(x.mData.data());
    mean_dev.ToDevice(mean.mData.data());
    inv_std_dev.ToDevice(inv_std.mData.data());

    // add device normalization instances (rank 5, 3 reduce dims)
    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
                                                                                   XDataType,
                                                                                   MeanInvStdDataType,
                                                                                   DGammaDataType,
                                                                                   DBetaDataType,
                                                                                   5,
                                                                                   3>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        // Compute host_dgamma/host_dbeta once on the CPU; every device
        // instance is compared against this single reference result.
        using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
                                                                                    XDataType,
                                                                                    GammaDataType,
                                                                                    MeanInvStdDataType,
                                                                                    DGammaDataType,
                                                                                    DBetaDataType,
                                                                                    DXDataType,
                                                                                    ComputeDataType>;

        ReferenceInstance ref;
        auto ref_argument =
            ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    // Bytes moved per invocation (all reads plus both outputs), used for GB/s.
    std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
                            x.mDesc.GetElementSize() * sizeof(XDataType) +
                            mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
                            inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
                            dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
                            dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);

    // Number of instances that actually supported the argument.
    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                          strideDy,
                                                          strideX,
                                                          strideMeanInvStd,
                                                          strideMeanInvStd,
                                                          gamma_beta_length,
                                                          strideDGamma,
                                                          strideDBeta,
                                                          reduce_dim,
                                                          dy_dev.GetDeviceBuffer(),
                                                          x_dev.GetDeviceBuffer(),
                                                          mean_dev.GetDeviceBuffer(),
                                                          inv_std_dev.GetDeviceBuffer(),
                                                          dgamma_dev.GetDeviceBuffer(),
                                                          dbeta_dev.GetDeviceBuffer());

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: ";
                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
            }
            continue;
        }

        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // avg_time is in ms, so bytes / 1e6 / ms == GB/s.
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            dgamma_dev.FromDevice(dgamma.mData.data());
            dbeta_dev.FromDevice(dbeta.mData.data());

            bool pass =
                ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
            pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
            }

            if(!pass)
            {
                // Bail out on the first failing instance.
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return true;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profiler/profile_layernorm_bwd_gamma_beta_impl.hpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm_bwd_gamma_beta.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"
namespace
ck
{
namespace
profiler
{
// Profiles every registered device instance of the layernorm backward
// (dgamma/dbeta) operation on a rank-Rank input whose first dimension is the
// batch axis, optionally verifying each instance against the CPU reference.
//
// Parameters:
//   do_verification - non-zero: compare device dgamma/dbeta to the CPU reference
//   init_method     - 0: constant init; 1: integer-valued init; otherwise decimal init
//   do_log          - dump tensor contents during verification
//   time_kernel     - time each kernel and print per-instance and best perf
//   length          - input extents; must have exactly Rank entries, Rank >= 2
//
// Returns true when at least one instance was applicable and every verified
// instance passed; false on bad input, verification failure, or when no
// kernel supports the argument.
template <typename DYDataType,
          typename XDataType,
          typename MeanInvStdDataType,
          typename ComputeDataType,
          typename DGammaDataType,
          typename DBetaDataType,
          index_t Rank>
bool profile_layernorm_bwd_gamma_beta_impl(int do_verification,
                                           int init_method,
                                           bool do_log,
                                           bool time_kernel,
                                           std::vector<index_t> length)
{
    // we don't need GammaDataType and DXDataType here, just for reference class
    using GammaDataType = DYDataType;
    using DXDataType    = DYDataType;

    if(length.size() != Rank || Rank < 2)
        return false;

    // Assume normalize dimension for first dimension
    // Layernorm 2D, input = [M, K], reduce on M axis
    // Layernorm 4D, input = [N, H, W, C], reduce on N axis
    constexpr int NumReduceDim = Rank - 1;
    // NOTE(review): reduce_dim has a single entry while NumReduceDim is
    // Rank - 1; the two only agree for Rank == 2 (the only rank the profiler
    // currently instantiates). Confirm before enabling higher ranks.
    std::vector<index_t> reduce_dim = {0};
    std::vector<index_t> invariant_length{length.begin() + 1, length.end()};

    Tensor<DYDataType> dy(length);
    Tensor<XDataType> x(length);
    // NOTE(review): gamma is never filled by the init switch below; it is only
    // passed to the CPU reference, whose dx output is not checked here.
    Tensor<GammaDataType> gamma(invariant_length); // dummy tensor, for reference
    Tensor<MeanInvStdDataType> mean({length[0]});
    Tensor<MeanInvStdDataType> inv_std({length[0]});
    Tensor<DGammaDataType> dgamma(invariant_length);
    Tensor<DBetaDataType> dbeta(invariant_length);
    Tensor<DXDataType> host_dx(length); // dummy tensor, for reference
    Tensor<DGammaDataType> host_dgamma(invariant_length);
    Tensor<DBetaDataType> host_dbeta(invariant_length);

    std::vector<index_t> strideDy =
        std::vector<ck::index_t>{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
    std::vector<index_t> strideX = strideDy;
    std::vector<index_t> strideDGamma{dgamma.mDesc.GetStrides().begin(),
                                      dgamma.mDesc.GetStrides().end()};
    std::vector<index_t> strideDBeta{dbeta.mDesc.GetStrides().begin(),
                                     dbeta.mDesc.GetStrides().end()};
    // mean/inv_std are [length[0]] broadcast over the Rank-D input: stride 1
    // on dim 0, zero elsewhere.
    // BUGFIX: use the count/value constructor so the vector has Rank entries.
    // The previous brace-init `{Rank, 0}` created the two-element vector
    // [Rank, 0] regardless of Rank; behavior is unchanged for Rank == 2 but
    // would have produced a truncated stride vector for any higher rank.
    std::vector<index_t> strideMeanInvStd(Rank, 0);
    strideMeanInvStd[0] = 1;

    switch(init_method)
    {
    case 0:
        // Constant init (GeneratorTensor_1).
        dy.GenerateTensorValue(GeneratorTensor_1<DYDataType>{});
        x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
        mean.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
        inv_std.GenerateTensorValue(GeneratorTensor_1<MeanInvStdDataType>{});
        dgamma.GenerateTensorValue(GeneratorTensor_1<DGammaDataType>{});
        dbeta.GenerateTensorValue(GeneratorTensor_1<DBetaDataType>{});
        break;
    case 1:
        // Integer-valued init; inv_std is kept non-negative.
        dy.GenerateTensorValue(GeneratorTensor_2<DYDataType>{-5, 5});
        x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
        mean.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{-5, 5});
        inv_std.GenerateTensorValue(GeneratorTensor_2<MeanInvStdDataType>{0, 5});
        dgamma.GenerateTensorValue(GeneratorTensor_2<DGammaDataType>{-5, 5});
        dbeta.GenerateTensorValue(GeneratorTensor_2<DBetaDataType>{-5, 5});
        break;
    default:
        // Decimal-valued init; inv_std is kept non-negative.
        dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0, 1});
        x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
        mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{-0.5, 0.5});
        inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0, 0.5});
        dgamma.GenerateTensorValue(GeneratorTensor_3<DGammaDataType>{-0.5, 0.5});
        dbeta.GenerateTensorValue(GeneratorTensor_3<DBetaDataType>{-0.5, 0.5});
    }

    DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
    DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
    DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
    DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());

    dy_dev.ToDevice(dy.mData.data());
    x_dev.ToDevice(x.mData.data());
    mean_dev.ToDevice(mean.mData.data());
    inv_std_dev.ToDevice(inv_std.mData.data());

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdGammaBeta<DYDataType,
                                                                                   XDataType,
                                                                                   MeanInvStdDataType,
                                                                                   DGammaDataType,
                                                                                   DBetaDataType,
                                                                                   Rank,
                                                                                   NumReduceDim>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        // Compute host_dgamma/host_dbeta once on the CPU; every device
        // instance is compared against this single reference result.
        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
                                                                                    XDataType,
                                                                                    GammaDataType,
                                                                                    MeanInvStdDataType,
                                                                                    DGammaDataType,
                                                                                    DBetaDataType,
                                                                                    DXDataType,
                                                                                    ComputeDataType>;

        ReferenceInstance ref;
        auto ref_argument =
            ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, length);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    // Bytes moved per invocation (all reads plus both outputs), used for GB/s.
    std::size_t num_bytes = dy.mDesc.GetElementSize() * sizeof(DYDataType) +
                            x.mDesc.GetElementSize() * sizeof(XDataType) +
                            mean.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
                            inv_std.mDesc.GetElementSize() * sizeof(MeanInvStdDataType) +
                            dgamma.mDesc.GetElementSize() * sizeof(DGammaDataType) +
                            dbeta.mDesc.GetElementSize() * sizeof(DBetaDataType);

    // Number of instances that actually supported the argument.
    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                          strideDy,
                                                          strideX,
                                                          strideMeanInvStd,
                                                          strideMeanInvStd,
                                                          invariant_length,
                                                          strideDGamma,
                                                          strideDBeta,
                                                          reduce_dim,
                                                          dy_dev.GetDeviceBuffer(),
                                                          x_dev.GetDeviceBuffer(),
                                                          mean_dev.GetDeviceBuffer(),
                                                          inv_std_dev.GetDeviceBuffer(),
                                                          dgamma_dev.GetDeviceBuffer(),
                                                          dbeta_dev.GetDeviceBuffer());

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            if(time_kernel)
            {
                std::cout << inst_ptr->GetTypeString()
                          << " skipped due to unsupported argument: ";
                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
            }
            continue;
        }

        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // avg_time is in ms, so bytes / 1e6 / ms == GB/s.
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            dgamma_dev.FromDevice(dgamma.mData.data());
            dbeta_dev.FromDevice(dbeta.mData.data());

            bool pass =
                ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
            pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "dy : ", dy.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_dgamma : ", host_dgamma.mData, ",")
                    << std::endl;
                LogRangeAsType<float>(std::cout << "dgamma : ", dgamma.mData, ",") << std::endl;
            }

            if(!pass)
            {
                // Bail out on the first failing instance.
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s,"
                  << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    return true;
}
}
// namespace profiler
}
// namespace ck
profiler/src/CMakeLists.txt
View file @
05fd7ff8
...
...
@@ -19,6 +19,8 @@ set(PROFILER_SOURCES
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
...
...
@@ -85,6 +87,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_gamma_beta_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_softmax_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batchnorm_instance
)
...
...
profiler/src/profile_gemm.cpp
View file @
05fd7ff8
...
...
@@ -137,8 +137,14 @@ int profile_gemm(int argc, char* argv[])
return
pass
?
0
:
1
;
};
if
(
false
)
;
if
(
data_type
!=
GemmDataType
::
F32_F32_F32
&&
data_type
!=
GemmDataType
::
F16_F16_F16
&&
data_type
!=
GemmDataType
::
BF16_BF16_BF16
&&
data_type
!=
GemmDataType
::
INT8_INT8_INT8
&&
data_type
!=
GemmDataType
::
F8_F8_F8
)
{
// dummy clause before the else clauses for different data types
std
::
cout
<<
"Gemm: this data_type is not implemented"
<<
std
::
endl
;
return
1
;
}
#ifdef CK_ENABLE_FP32
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
@@ -231,7 +237,7 @@ int profile_gemm(int argc, char* argv[])
#endif
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"
Gemm:
this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
...
...
profiler/src/profile_groupnorm_bwd_gamma_beta.cpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"
#include "profiler_operation_registry.hpp"
using
ck
::
index_t
;
// Minimal long-option parser for the groupnorm bwd-gamma/beta profiler:
// collects the integer values following each registered "--<name>" option.
struct groupnormBwdGammaBetaArgParser
{
    // Option name (without the leading "--") -> parsed integer values.
    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};

    // If argv[i] is exactly "--<key>", consume every subsequent argument up
    // to (but excluding) the next "-"-prefixed token as an integer value of
    // <key>. Returns true when argv[i] matched the option, false otherwise.
    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
        if(std::string("--") + key != argv[i])
        {
            return false;
        }

        const int first_value = i + 1;
        int stop              = first_value;
        while(stop < argc && argv[stop][0] != '-')
        {
            ++stop;
        }

        for(int j = first_value; j < stop; ++j)
        {
            long_opts[key].push_back(std::stoi(argv[j]));
        }

        return true;
    }

    // Scan the command line once per registered option; only the first
    // occurrence of each option is consumed.
    void operator()(int argc, char* argv[])
    {
        for(auto& kv : long_opts)
        {
            for(int i = 1; i < argc; ++i)
            {
                if(parse_opt(argc, argv, kv.first, i))
                {
                    break;
                }
            }
        }
    }
};
// Prints the positional/long-option usage of the groupnorm bwd-gamma/beta
// profiler.
void print_help_groupnorm_bwd_gamma_beta()
{
    // eg: ckProfiler groupnorm_bwd_gamma_beta 1 0 2 0 1 --length 1 16 16 32 40
    std::cout << "arg1: data type (0: fp16; 1: fp32)\n";
    std::cout << "arg2: verification (0: no; 1: yes)\n";
    std::cout << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n";
    std::cout << "arg4: print tensor value (0: no; 1: yes)\n";
    std::cout << "arg5: time kernel (0=no, 1=yes)\n";
    std::cout << "--length: tensor extents (e.g, --length 1 16 16 32 40)\n" << std::endl;
}
// Entry point for "ckProfiler groupnorm_bwd_gamma_beta".
// argv layout: [1]=op name, [2]=data type, [3]=verification, [4]=init method,
// [5]=print tensor values, [6]=time kernel, then long options (--length ...).
// Returns 0 on success/help; throws on unsupported data type or bad --length.
int profile_groupnorm_bwd_gamma_beta(int argc, char* argv[])
{
    // BUGFIX: argv[2..6] are read below, so require at least 7 arguments.
    // The previous guard (argc <= 2) allowed out-of-bounds argv reads for
    // argc in [3, 6]; those invocations now print the help text instead.
    if(argc < 7)
    {
        print_help_groupnorm_bwd_gamma_beta();
        return 0;
    }

    groupnormBwdGammaBetaArgParser arg_parser;

    // short unnamed options
    const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
    const bool do_verification       = std::stoi(argv[3]);
    const int init_method            = std::stoi(argv[4]);
    const bool do_log                = std::stoi(argv[5]);
    const bool time_kernel           = std::stoi(argv[6]);

    // parse the long options
    arg_parser(argc, argv);
    const std::vector<index_t> length = arg_parser.long_opts["length"];

    using F32 = float;

    if(length.size() == 5)
    {
        // Only fp32 instances are registered for groupnorm bwd gamma/beta.
        if(data_type == ck::DataTypeEnum::Float)
        {
            ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<F32, F32, F32, F32, F32, F32>(
                do_verification, init_method, do_log, time_kernel, length);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
    else
    {
        throw std::runtime_error("length should be 5");
    }

    return 0;
}
REGISTER_PROFILER_OPERATION
(
"groupnorm_bwd_gamma_beta"
,
"Group Normalization"
,
profile_groupnorm_bwd_gamma_beta
);
profiler/src/profile_layernorm_bwd_gamma_beta.cpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/data_type_enum.hpp"
#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"
#include "profiler_operation_registry.hpp"
using
ck
::
index_t
;
// Minimal long-option parser for the layernorm bwd-gamma/beta profiler:
// collects the integer values following each registered "--<name>" option.
struct layernormBwdGammaBetaArgParser
{
    // Option name (without the leading "--") -> parsed integer values.
    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};

    // If argv[i] is exactly "--<key>", consume every subsequent argument up
    // to (but excluding) the next "-"-prefixed token as an integer value of
    // <key>. Returns true when argv[i] matched the option, false otherwise.
    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
        if(std::string("--") + key != argv[i])
        {
            return false;
        }

        const int first_value = i + 1;
        int stop              = first_value;
        while(stop < argc && argv[stop][0] != '-')
        {
            ++stop;
        }

        for(int j = first_value; j < stop; ++j)
        {
            long_opts[key].push_back(std::stoi(argv[j]));
        }

        return true;
    }

    // Scan the command line once per registered option; only the first
    // occurrence of each option is consumed.
    void operator()(int argc, char* argv[])
    {
        for(auto& kv : long_opts)
        {
            for(int i = 1; i < argc; ++i)
            {
                if(parse_opt(argc, argv, kv.first, i))
                {
                    break;
                }
            }
        }
    }
};
// Prints the positional/long-option usage of the layernorm bwd-gamma/beta
// profiler.
void print_help_layernorm_bwd_gamma_beta()
{
    // eg: ckProfiler layernorm_bwd_gamma_beta 0 0 2 0 1 --length 1502 4096
    std::cout << "arg1: data type (0: fp16; 1: fp32)\n";
    std::cout << "arg2: verification (0: no; 1: yes)\n";
    std::cout << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n";
    std::cout << "arg4: print tensor value (0: no; 1: yes)\n";
    std::cout << "arg5: time kernel (0=no, 1=yes)\n";
    std::cout << "--length: tensor extents (e.g, --length 1024 1024)\n" << std::endl;
}
// Entry point for "ckProfiler layernorm_bwd_gamma_beta".
// argv layout: [1]=op name, [2]=data type, [3]=verification, [4]=init method,
// [5]=print tensor values, [6]=time kernel, then long options (--length ...).
// Returns 0 on success/help; throws on unsupported data type or rank.
int profile_layernorm_bwd_gamma_beta(int argc, char* argv[])
{
    // BUGFIX: argv[2..6] are read below, so require at least 7 arguments.
    // The previous guard (argc <= 2) allowed out-of-bounds argv reads for
    // argc in [3, 6]; those invocations now print the help text instead.
    if(argc < 7)
    {
        print_help_layernorm_bwd_gamma_beta();
        return 0;
    }

    layernormBwdGammaBetaArgParser arg_parser;

    // short unnamed options
    const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
    const bool do_verification       = std::stoi(argv[3]);
    const int init_method            = std::stoi(argv[4]);
    const bool do_log                = std::stoi(argv[5]);
    const bool time_kernel           = std::stoi(argv[6]);

    // parse the long options
    arg_parser(argc, argv);
    const std::vector<index_t> length = arg_parser.long_opts["length"];

    using F16 = ck::half_t;
    using F32 = float;

    // Only rank-2 problems are currently wired up.
    if(length.size() == 2)
    {
        constexpr int rank = 2;
        if(data_type == ck::DataTypeEnum::Half)
        {
            // fp16 I/O with fp32 accumulation.
            ck::profiler::profile_layernorm_bwd_gamma_beta_impl<F16, F16, F16, F32, F16, F16, rank>(
                do_verification, init_method, do_log, time_kernel, length);
        }
        else if(data_type == ck::DataTypeEnum::Float)
        {
            ck::profiler::profile_layernorm_bwd_gamma_beta_impl<F32, F32, F32, F32, F32, F32, rank>(
                do_verification, init_method, do_log, time_kernel, length);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
    else
    {
        throw std::runtime_error("not implemented yet");
    }

    return 0;
}
REGISTER_PROFILER_OPERATION
(
"layernorm_bwd_gamma_beta"
,
"Layer Normalization"
,
profile_layernorm_bwd_gamma_beta
);
script/clang-format-overwrite.sh
View file @
05fd7ff8
#
find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
find
.
-name
deps
-prune
-o
-name
build
-prune
-o
-iname
'*.h'
-o
-iname
'*.hpp'
-o
-iname
'*.cpp'
-o
-iname
'*.h.in'
-o
-iname
'*.hpp.in'
-o
-iname
'*.cpp.in'
-o
-iname
'*.cl'
-o
-iname
'*.cuh'
-o
-iname
'*.cu'
-o
-iname
'*.inc'
| xargs
-n
1
-P
16
-I
{}
-t
sh
-c
'clang-format-12 -i -style=file {}'
git status
--porcelain
|
awk
'$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}'
| xargs
-n
1
-P
16
-I
{}
-t
sh
-c
'clang-format-12 -i -style=file {}'
test/CMakeLists.txt
View file @
05fd7ff8
...
...
@@ -140,6 +140,7 @@ add_subdirectory(block_to_ctile_map)
add_subdirectory
(
softmax
)
add_subdirectory
(
normalization_fwd
)
add_subdirectory
(
normalization_bwd_data
)
add_subdirectory
(
normalization_bwd_gamma_beta
)
add_subdirectory
(
data_type
)
add_subdirectory
(
elementwise_normalization
)
add_subdirectory
(
batchnorm
)
...
...
test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
View file @
05fd7ff8
...
...
@@ -135,6 +135,8 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
return
col2img
.
IsSupportedArgument
(
argument
);
}
throw
std
::
runtime_error
(
"Conv_tensor_rearrange: problem with tensor rearrange operator. "
);
return
1
;
}
};
...
...
test/normalization_bwd_gamma_beta/CMakeLists.txt
0 → 100644
View file @
05fd7ff8
add_custom_target
(
test_normalization_bwd_gamma_beta
)
add_gtest_executable
(
test_layernorm2d_bwd_gamma_beta_fp32 test_layernorm2d_bwd_gamma_beta_fp32.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_layernorm2d_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance
)
add_dependencies
(
test_normalization_bwd_gamma_beta test_layernorm2d_bwd_gamma_beta_fp32
)
endif
()
add_gtest_executable
(
test_groupnorm_bwd_gamma_beta_fp32 test_groupnorm_bwd_gamma_beta_fp32.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_groupnorm_bwd_gamma_beta_fp32 PRIVATE utility device_normalization_bwd_gamma_beta_instance
)
add_dependencies
(
test_normalization_bwd_gamma_beta test_groupnorm_bwd_gamma_beta_fp32
)
endif
()
test/normalization_bwd_gamma_beta/test_groupnorm_bwd_gamma_beta_fp32.cpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_groupnorm_bwd_gamma_beta_impl.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ck
::
index_t
;
// Typed gtest fixture: runs the groupnorm bwd-gamma/beta profiler with
// verification enabled over a set of [N, H, W, G, C] problem sizes and
// expects every registered device instance to match the CPU reference.
template <typename Tuple>
class TestgroupnormBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, H, W, G, C], reduce H, W, C
        // Problem sizes cover trivial, small, batched, and wide-channel cases.
        std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
                                                         {1, 2, 3, 4, 5},
                                                         {256, 9, 9, 9, 9},
                                                         {1, 64, 64, 32, 10},
                                                         {1, 32, 32, 32, 20},
                                                         {1, 16, 16, 32, 40}};

        for(auto length : lengths)
        {
            // do_verification = true, init_method = 2 (decimal values),
            // do_log = false, time_kernel = false.
            bool success =
                ck::profiler::profile_groupnorm_bwd_gamma_beta_impl<DYDataType,
                                                                    XDataType,
                                                                    MeanInvStdDataType,
                                                                    ComputeDataType,
                                                                    DGammaDataType,
                                                                    DBetaDataType>(
                    true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};
using
KernelTypes
=
::
testing
::
Types
<
// DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
std
::
tuple
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
>>
;
TYPED_TEST_SUITE
(
TestgroupnormBwdGammaBeta
,
KernelTypes
);
TYPED_TEST
(
TestgroupnormBwdGammaBeta
,
Test_FP32
)
{
this
->
Run
();
}
test/normalization_bwd_gamma_beta/test_layernorm2d_bwd_gamma_beta_fp32.cpp
0 → 100644
View file @
05fd7ff8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_layernorm_bwd_gamma_beta_impl.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ck
::
index_t
;
// Typed gtest fixture: runs the rank-2 layernorm bwd-gamma/beta profiler with
// verification enabled over a set of [N, D] problem sizes and expects every
// registered device instance to match the CPU reference.
template <typename Tuple>
class TestLayernorm2dBwdGammaBeta : public ::testing::Test
{
    protected:
    using DYDataType         = std::tuple_element_t<0, Tuple>;
    using XDataType          = std::tuple_element_t<1, Tuple>;
    using MeanInvStdDataType = std::tuple_element_t<2, Tuple>;
    using ComputeDataType    = std::tuple_element_t<3, Tuple>;
    using DGammaDataType     = std::tuple_element_t<4, Tuple>;
    using DBetaDataType      = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // Bwd data: [N, D], reduce D
        // Includes non-power-of-two and large batch/feature sizes.
        std::vector<std::vector<ck::index_t>> lengths = {
            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};

        for(auto length : lengths)
        {
            // do_verification = true, init_method = 2 (decimal values),
            // do_log = false, time_kernel = false, Rank = 2.
            bool success = ck::profiler::profile_layernorm_bwd_gamma_beta_impl<DYDataType,
                                                                               XDataType,
                                                                               MeanInvStdDataType,
                                                                               ComputeDataType,
                                                                               DGammaDataType,
                                                                               DBetaDataType,
                                                                               2>(
                true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};
using
KernelTypes
=
::
testing
::
Types
<
// DYDataType XDataType, MeanInvStdDataType, ComputeDataType, DGammaDataType, DBetaDataType>
std
::
tuple
<
F32
,
F32
,
F32
,
F32
,
F32
,
F32
>>
;
TYPED_TEST_SUITE
(
TestLayernorm2dBwdGammaBeta
,
KernelTypes
);
TYPED_TEST
(
TestLayernorm2dBwdGammaBeta
,
Test_FP32
)
{
this
->
Run
();
}
test/wrapper/test_copy.cpp
View file @
05fd7ff8
...
...
@@ -21,49 +21,59 @@ template <typename InputTensor,
typename
OutputTensor
,
typename
BlockShape
,
typename
ThreadLayoutShape
,
typename
LocalTileSteps
,
typename
LocalPartitionSteps
>
bool
UseOptimizedCopy
>
__global__
void
TestCopyDevice
(
const
InputTensor
input_tensor
,
OutputTensor
output_tensor
,
const
BlockShape
tile_shape
,
const
ThreadLayoutShape
thread_layout
,
const
LocalTileSteps
block_steps
,
const
LocalPartitionSteps
thread_steps
)
const
ThreadLayoutShape
thread_layout
)
{
__shared__
ck
::
index_t
p_shared
[
ck
::
wrapper
::
size
(
tile_shape
)];
auto
tensor_lds
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
const
auto
tensor_lds
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
p_shared
,
ck
::
wrapper
::
make_layout
(
tile_shape
));
const
auto
block_idx
s
=
ck
::
make_tuple
(
ck
::
make_tuple
(
0
,
0
),
blockIdx
.
x
);
const
auto
block_idx
=
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
);
// Get local tiles for global memory
const
auto
input_local_tile
=
ck
::
wrapper
::
make_local_tile
(
input_tensor
,
tile_shape
,
block_idxs
,
block_steps
);
const
auto
input_local_tile
=
ck
::
wrapper
::
make_local_tile
(
input_tensor
,
tile_shape
,
block_idx
);
const
auto
output_local_tile
=
ck
::
wrapper
::
make_local_tile
(
output_tensor
,
tile_shape
,
block_idx
s
,
block_steps
);
ck
::
wrapper
::
make_local_tile
(
output_tensor
,
tile_shape
,
block_idx
);
// Get partition per thread
const
auto
input_local_partition
=
ck
::
wrapper
::
make_local_partition
(
input_local_tile
,
thread_layout
,
threadIdx
.
x
,
thread_steps
);
const
auto
input_local_partition
=
ck
::
wrapper
::
make_local_partition
(
input_local_tile
,
thread_layout
,
threadIdx
.
x
);
auto
lds_local_partition
=
ck
::
wrapper
::
make_local_partition
(
tensor_lds
,
thread_layout
,
threadIdx
.
x
,
thread_steps
);
auto
output_local_partition
=
ck
::
wrapper
::
make_local_partition
(
output_local_tile
,
thread_layout
,
threadIdx
.
x
,
thread_steps
);
ck
::
wrapper
::
make_local_partition
(
tensor_lds
,
thread_layout
,
threadIdx
.
x
);
auto
output_local_partition
=
ck
::
wrapper
::
make_local_partition
(
output_local_tile
,
thread_layout
,
threadIdx
.
x
);
// Allocate VGPR
constexpr
ck
::
index_t
scalar_per_vector
=
1
;
constexpr
ck
::
index_t
vgpr_size
=
ck
::
wrapper
::
size
(
lds_local_partition
);
auto
tensor_vgpr
=
ck
::
wrapper
::
make_register_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Vgpr
,
vgpr_size
,
scalar_per_vector
,
ck
::
index_t
>
();
auto
tensor_vgpr
=
ck
::
wrapper
::
make_register_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Vgpr
,
ck
::
index_t
>
(
layout
(
lds_local_partition
));
// Perform copy
ck
::
wrapper
::
copy
(
input_local_partition
,
lds_local_partition
);
ck
::
wrapper
::
copy
(
lds_local_partition
,
tensor_vgpr
);
ck
::
wrapper
::
copy
(
tensor_vgpr
,
output_local_partition
);
if
constexpr
(
UseOptimizedCopy
)
{
using
DimAccessOrder
=
ck
::
Tuple
<
ck
::
Number
<
1
>
,
ck
::
Number
<
0
>>
;
constexpr
ck
::
index_t
vector_dim
=
0
;
constexpr
ck
::
index_t
scalar_per_vector
=
2
;
ck
::
wrapper
::
copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
input_local_partition
,
lds_local_partition
);
// TODO: Enable optimized copy for static buffers
ck
::
wrapper
::
copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
lds_local_partition
,
tensor_vgpr
);
ck
::
wrapper
::
copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
tensor_vgpr
,
output_local_partition
);
}
else
{
ck
::
wrapper
::
copy
(
input_local_partition
,
lds_local_partition
);
ck
::
wrapper
::
copy
(
lds_local_partition
,
tensor_vgpr
);
ck
::
wrapper
::
copy
(
tensor_vgpr
,
output_local_partition
);
}
}
template
<
bool
UseOptimizedCopy
>
void
PerformCopyGlobalToGlobalViaLDS
()
{
const
auto
shape
=
...
...
@@ -89,15 +99,8 @@ void PerformCopyGlobalToGlobalViaLDS()
auto
output_tensor_global
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
ck
::
index_t
*>
(
out_buf
.
GetDeviceBuffer
()),
layout
);
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{}),
ck
::
Number
<
32
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
2
>
{},
ck
::
Number
<
2
>
{}),
ck
::
Number
<
64
>
{});
const
auto
thread_steps
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{}),
ck
::
Number
<
2
>
{});
const
auto
block_steps
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{}),
ck
::
Number
<
64
>
{});
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
32
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
4
>
{},
ck
::
Number
<
64
>
{});
const
ck
::
index_t
grid_size
=
ck
::
math
::
integer_divide_ceil
(
ck
::
wrapper
::
size
(
input_tensor_global
),
ck
::
wrapper
::
size
(
tile_shape
));
...
...
@@ -106,8 +109,7 @@ void PerformCopyGlobalToGlobalViaLDS()
decltype
(
output_tensor_global
),
decltype
(
tile_shape
),
decltype
(
thread_layout
),
decltype
(
block_steps
),
decltype
(
thread_steps
)
>
;
UseOptimizedCopy
>
;
launch_and_time_kernel
(
StreamConfig
{},
kernel
,
dim3
(
grid_size
),
...
...
@@ -116,9 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
input_tensor_global
,
output_tensor_global
,
tile_shape
,
thread_layout
,
block_steps
,
thread_steps
);
thread_layout
);
// Verify results
std
::
vector
<
ck
::
index_t
>
output_data
(
ck
::
wrapper
::
size
(
shape
));
...
...
@@ -126,4 +126,5 @@ void PerformCopyGlobalToGlobalViaLDS()
EXPECT_TRUE
(
ck
::
utils
::
check_err
(
output_data
,
input_data
));
}
TEST
(
TestCopy
,
CopyGlobalToGlobalViaLDS
)
{
PerformCopyGlobalToGlobalViaLDS
();
}
TEST
(
TestCopyGlobalToGlobalViaLDS
,
GenericCopy
)
{
PerformCopyGlobalToGlobalViaLDS
<
false
>
();
}
TEST
(
TestCopyGlobalToGlobalViaLDS
,
OptimizedCopy
)
{
PerformCopyGlobalToGlobalViaLDS
<
true
>
();
}
test/wrapper/test_partition.cpp
View file @
05fd7ff8
...
...
@@ -29,42 +29,29 @@ TEST(TestPartition, LocalPartition)
const
auto
tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Generic
>
(
data
.
data
(),
layout
);
const
auto
thread_steps
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
2
>
{},
ck
::
Number
<
1
>
{}),
ck
::
Number
<
1
>
{});
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
1
>
{}),
ck
::
Number
<
1
>
{});
for
(
ck
::
index_t
thread_id
=
0
;
thread_id
<
ck
::
wrapper
::
size
(
thread_layout
);
thread_id
++
)
{
const
auto
raked_partition
=
ck
::
wrapper
::
make_local_partition
(
tensor
,
thread_layout
,
thread_id
);
const
auto
expected_partition_size
=
ck
::
wrapper
::
size
(
tensor
)
/
ck
::
wrapper
::
size
(
thread_layout
);
EXPECT_EQ
(
ck
::
wrapper
::
size
(
raked_partition
),
expected_partition_size
);
EXPECT_EQ
(
raked_partition
(
0
),
thread_id
);
}
const
auto
thread_steps
=
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
1
>
{});
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
1
>
{});
for
(
ck
::
index_t
thread_id
=
0
;
thread_id
<
ck
::
wrapper
::
size
(
thread_layout
);
thread_id
++
)
{
const
auto
packed_partition
=
ck
::
wrapper
::
make_local_partition
(
tensor
,
thread_layout
,
thread_id
,
thread_steps
);
ck
::
wrapper
::
make_local_partition
(
tensor
,
thread_layout
,
thread_id
);
const
auto
expected_partition_size
=
ck
::
wrapper
::
size
(
tensor
)
/
ck
::
wrapper
::
size
(
thread_layout
);
const
auto
expected_partition_first_val
=
thread_id
*
ck
::
wrapper
::
size
<
0
,
0
>
(
thread_steps
);
const
auto
expected_partition_first_val
=
thread_id
*
ck
::
wrapper
::
size
<
0
>
(
thread_steps
);
const
auto
expected_partition_second_val
=
expected_partition_first_val
+
1
;
EXPECT_EQ
(
ck
::
wrapper
::
size
(
packed_partition
),
expected_partition_size
);
EXPECT_EQ
(
packed_partition
(
0
),
expected_partition_first_val
);
EXPECT_EQ
(
packed_partition
(
1
),
expected_partition_second_val
);
}
}
TEST
(
TestPartition
,
LocalTile
)
{
const
auto
shape
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
16
>
{},
ck
::
Number
<
4
>
{}),
ck
::
Number
<
4
>
{});
const
auto
strides
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
16
>
{}),
ck
::
Number
<
64
>
{});
const
auto
layout
=
ck
::
wrapper
::
make_layout
(
shape
,
strides
);
const
auto
shape
=
ck
::
make_tuple
(
ck
::
Number
<
16
>
{},
ck
::
Number
<
4
>
{},
ck
::
Number
<
4
>
{});
const
auto
strides
=
ck
::
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
16
>
{},
ck
::
Number
<
64
>
{});
const
auto
layout
=
ck
::
wrapper
::
make_layout
(
shape
,
strides
);
std
::
vector
<
ck
::
index_t
>
data
(
ck
::
wrapper
::
size
(
layout
));
std
::
iota
(
data
.
begin
(),
data
.
end
(),
0
);
...
...
@@ -72,48 +59,34 @@ TEST(TestPartition, LocalTile)
const
auto
tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Generic
>
(
data
.
data
(),
layout
);
const
auto
block_steps
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
4
>
{},
ck
::
Number
<
2
>
{}),
ck
::
Number
<
2
>
{});
const
auto
block_shape
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
4
>
{},
ck
::
Number
<
2
>
{}),
ck
::
Number
<
2
>
{});
const
auto
block_layout
=
ck
::
make_tuple
(
ck
::
make_tuple
(
ck
::
Number
<
4
>
{},
ck
::
Number
<
2
>
{}),
ck
::
Number
<
2
>
{});
std
::
vector
<
ck
::
Tuple
<
ck
::
Tuple
<
ck
::
index_t
,
ck
::
index_t
>
,
ck
::
index_t
>>
block_idxs
;
for
(
ck
::
index_t
x
=
0
;
x
<
ck
::
wrapper
::
size
<
0
,
0
>
(
block_layout
);
x
++
)
{
for
(
ck
::
index_t
y
=
0
;
y
<
ck
::
wrapper
::
size
<
0
,
1
>
(
block_layout
);
y
++
)
{
for
(
ck
::
index_t
z
=
0
;
z
<
ck
::
wrapper
::
size
<
1
>
(
block_layout
);
z
++
)
{
block_idxs
.
emplace_back
(
ck
::
make_tuple
(
x
,
y
),
z
);
}
}
}
for
(
const
auto
&
block_idx
:
block_idxs
)
{
const
auto
raked_tile
=
ck
::
wrapper
::
make_local_tile
(
tensor
,
block_shape
,
block_idx
);
const
auto
expected_tile_size
=
ck
::
wrapper
::
size
(
block_shape
);
EXPECT_EQ
(
ck
::
wrapper
::
size
(
raked_tile
),
expected_tile_size
);
EXPECT_EQ
(
raked_tile
(
0
),
layout
(
block_idx
));
}
const
auto
block_shape
=
ck
::
make_tuple
(
ck
::
Number
<
2
>
{},
ck
::
Number
<
4
>
{},
ck
::
Number
<
2
>
{});
const
auto
num_blocks
=
ck
::
make_tuple
(
ck
::
wrapper
::
size
<
0
>
(
shape
)
/
ck
::
wrapper
::
size
<
0
>
(
block_shape
),
ck
::
wrapper
::
size
<
1
>
(
shape
)
/
ck
::
wrapper
::
size
<
1
>
(
block_shape
),
ck
::
wrapper
::
size
<
2
>
(
shape
)
/
ck
::
wrapper
::
size
<
2
>
(
block_shape
));
std
::
vector
<
ck
::
index_t
>
block_idxs
(
ck
::
wrapper
::
size
(
num_blocks
));
std
::
iota
(
block_idxs
.
begin
(),
block_idxs
.
end
(),
0
);
for
(
const
auto
&
block_idx
:
block_idxs
)
for
(
auto
block_idx
:
block_idxs
)
{
const
auto
packed_tile
=
ck
::
wrapper
::
make_local_tile
(
tensor
,
block_shape
,
block_idx
,
block_steps
);
const
auto
packed_tile
=
ck
::
wrapper
::
make_local_tile
(
tensor
,
block_shape
,
block_idx
);
const
auto
expected_tile_size
=
ck
::
wrapper
::
size
(
block_shape
);
const
auto
expected_tile_first_val
=
ck
::
wrapper
::
size
<
0
,
0
>
(
block_idx
)
*
ck
::
wrapper
::
size
<
0
,
0
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
0
,
0
>
(
strides
)
+
ck
::
wrapper
::
size
<
0
,
1
>
(
block_idx
)
*
ck
::
wrapper
::
size
<
0
,
1
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
0
,
1
>
(
strides
)
+
ck
::
wrapper
::
size
<
1
>
(
block_idx
)
*
ck
::
wrapper
::
size
<
1
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
1
>
(
strides
);
auto
expected_tile_first_val
=
(
block_idx
%
ck
::
wrapper
::
size
<
2
>
(
num_blocks
))
*
ck
::
wrapper
::
size
<
2
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
2
>
(
strides
);
block_idx
/=
ck
::
wrapper
::
size
<
2
>
(
num_blocks
);
expected_tile_first_val
+=
(
block_idx
%
ck
::
wrapper
::
size
<
1
>
(
num_blocks
))
*
ck
::
wrapper
::
size
<
1
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
1
>
(
strides
);
block_idx
/=
ck
::
wrapper
::
size
<
1
>
(
num_blocks
);
expected_tile_first_val
+=
(
block_idx
%
ck
::
wrapper
::
size
<
0
>
(
num_blocks
))
*
ck
::
wrapper
::
size
<
0
>
(
block_shape
)
*
ck
::
wrapper
::
size
<
0
>
(
strides
);
const
auto
expected_tile_second_val
=
expected_tile_first_val
+
1
;
EXPECT_EQ
(
ck
::
wrapper
::
size
(
packed_tile
),
expected_tile_size
);
EXPECT_EQ
(
packed_tile
(
0
),
expected_tile_first_val
);
EXPECT_EQ
(
packed_tile
(
1
),
expected_tile_second_val
);
}
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment