Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9684677a
Unverified
Commit
9684677a
authored
Nov 08, 2023
by
arai713
Committed by
GitHub
Nov 08, 2023
Browse files
Merge branch 'develop' into hip_tensor_permute
parents
36f6966a
98fd41f5
Changes
182
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2127 additions
and
202 deletions
+2127
-202
library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
..._instance/gpu/contraction/device_contraction_instance.hpp
+292
-0
library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
...ry/tensor_operation_instance/gpu/contraction_bilinear.hpp
+389
-35
library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
...brary/tensor_operation_instance/gpu/contraction_scale.hpp
+387
-33
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
...y/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
+279
-54
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
...include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+25
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
..._grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
+131
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
...ration_instance/gpu/grouped_convolution_backward_data.hpp
+34
-15
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
...tion_instance/gpu/grouped_convolution_backward_weight.hpp
+46
-19
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
...or_operation_instance/gpu/grouped_convolution_forward.hpp
+28
-23
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
...pu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
+176
-0
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
...ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+44
-0
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
...sor_operation_instance/gpu/column_to_image/CMakeLists.txt
+6
-3
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp
...mn_to_image/device_column_to_image_gndhwc_3d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp
...umn_to_image/device_column_to_image_gnhwc_2d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp
...lumn_to_image/device_column_to_image_gnwc_1d_instance.cpp
+4
-4
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
...mn_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
+62
-0
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp
...umn_to_image/device_column_to_image_nhwgc_2d_instance.cpp
+62
-0
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp
...lumn_to_image/device_column_to_image_nwgc_1d_instance.cpp
+61
-0
library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
...peration_instance/gpu/contraction_bilinear/CMakeLists.txt
+36
-8
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
...shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+57
-0
No files found.
library/include/ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
using
F32
=
float
;
using
F64
=
double
;
using
F16_Tuple
=
ck
::
Tuple
<
F16
>
;
using
BF16_Tuple
=
ck
::
Tuple
<
BF16
>
;
using
F32_Tuple
=
ck
::
Tuple
<
F32
>
;
using
F64_Tuple
=
ck
::
Tuple
<
F64
>
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Bilinear
=
ck
::
tensor_operation
::
element_wise
::
Bilinear
;
using
Scale
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
static
constexpr
auto
GemmMNKPadding
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_kk_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
32
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
32
,
64
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
4
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_kn_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
1
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
4
,
1
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
4
,
1
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
1
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
4
,
1
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
4
,
1
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
4
,
1
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
4
,
1
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_mk_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
1
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
1
,
4
,
32
,
32
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
1
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
1
,
4
,
32
,
32
,
2
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
1
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
1
,
4
,
32
,
32
,
2
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
1
,
4
,
32
,
32
,
2
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
1
,
4
,
32
,
32
,
1
,
2
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_mn_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
1
,
1
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
1
,
1
,
32
,
32
,
2
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
256
,
16
,
4
,
4
,
32
,
32
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
1
,
1
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
1
,
1
,
32
,
32
,
2
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
1
,
1
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
1
,
1
,
32
,
32
,
2
,
2
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
1
,
1
,
32
,
32
,
2
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
1
,
1
,
32
,
32
,
1
,
2
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
4
,
4
,
32
,
32
,
1
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
2
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_f64_kk_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
32
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
32
,
128
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
64
,
32
,
64
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_f64_kn_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
2
,
1
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
2
,
1
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
2
,
1
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
2
,
1
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
2
,
1
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_f64_mk_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
1
,
2
,
16
,
16
,
4
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
1
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
1
,
2
,
16
,
16
,
4
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
1
,
2
,
16
,
16
,
4
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
1
,
2
,
16
,
16
,
2
,
4
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
// clang-format on
>
;
template
<
typename
ADataType
,
typename
BDataType
,
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
typename
EDataType
,
typename
ComputeDataType
,
typename
AElementwiseOp
,
typename
BElementwiseOp
,
typename
CDEElementwiseOp
>
using
device_contraction_f64_mn_instance
=
std
::
tuple
<
// clang-format off
//#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
1
,
1
,
16
,
16
,
4
,
4
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
1
,
1
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
1
,
1
,
16
,
16
,
4
,
4
,
S
<
8
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
128
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
4
,
4
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
1
,
1
,
16
,
16
,
4
,
2
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
128
,
64
,
16
,
2
,
2
,
16
,
16
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
1
,
1
,
16
,
16
,
2
,
4
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
,
DeviceContractionMultipleD_Xdl_CShuffle
<
2
,
2
,
2
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
,
GemmMNKPadding
,
1
,
256
,
64
,
128
,
16
,
2
,
2
,
16
,
16
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
S
<
4
,
64
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
1
,
ComputeDataType
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp
View file @
9684677a
...
...
@@ -17,7 +17,6 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP32
// float
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
...
...
@@ -28,7 +27,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn
F32
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -40,7 +40,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn
F32
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -52,7 +53,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn
F32
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -64,10 +66,115 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn
F32
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
#endif
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
F16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
F16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
F16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
F16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
BF16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
BF16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
BF16
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
F32_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
,
BF16
>>>&
instances
);
#endif // CK_ENABLE_FP32
#ifdef CK_ENABLE_FP64
// double
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
...
...
@@ -78,7 +185,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn
F64
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F64
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -90,7 +198,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn
F64
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F64
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -102,7 +211,8 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn
F64
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
Bilinear
,
F64
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -114,8 +224,170 @@ void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn
F64
,
PassThrough
,
PassThrough
,
Bilinear
>>>&
instances
);
#endif
Bilinear
,
F64
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
F64_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
F64_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
F64_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
F64_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
BF16_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
BF16_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
BF16_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
BF16_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP16
// Contraction + Bilinear
template
<
index_t
NumDimM
,
index_t
NumDimN
,
...
...
@@ -123,7 +395,8 @@ template <index_t NumDimM,
typename
ADataType
,
typename
BDataType
,
typename
DDataType
,
typename
EDataType
>
typename
EDataType
,
typename
ComputeDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
...
...
@@ -134,7 +407,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Bilinear
>>
ck
::
tensor_operation
::
element_wise
::
Bilinear
,
ComputeDataType
>>
{
using
DeviceOp
=
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
...
...
@@ -145,45 +419,125 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Bilinear
>
;
ck
::
tensor_operation
::
element_wise
::
Bilinear
,
ComputeDataType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
ADataType
,
float
>
&&
is_same_v
<
BDataType
,
float
>
&&
is_same_v
<
DDataType
,
float
>
&&
is_same_v
<
EDataType
,
float
>
)
is_same_v
<
EDataType
,
float
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance
(
op_ptrs
);
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
ck
::
half_t
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
ck
::
bhalf_t
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance
(
op_ptrs
);
}
}
}
#endif
#endif
// CK_ENABLE_FP32
#ifdef CK_ENABLE_FP64
if
constexpr
(
is_same_v
<
ADataType
,
double
>
&&
is_same_v
<
BDataType
,
double
>
&&
is_same_v
<
DDataType
,
double
>
&&
is_same_v
<
EDataType
,
double
>
)
is_same_v
<
EDataType
,
double
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
if
constexpr
(
is_same_v
<
ComputeDataType
,
double
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance
(
op_ptrs
);
}
}
}
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
half_t
>
&&
is_same_v
<
BDataType
,
ck
::
half_t
>
&&
is_same_v
<
EDataType
,
ck
::
half_t
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance
(
op_ptrs
);
}
}
}
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
BDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
EDataType
,
ck
::
bhalf_t
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
(
op_ptrs
);
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance
(
op_ptrs
);
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance
(
op_ptrs
);
}
}
}
#endif
#endif
// CK_ENABLE_BF16
return
op_ptrs
;
}
};
...
...
library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp
View file @
9684677a
...
...
@@ -17,7 +17,6 @@ namespace tensor_operation {
namespace
device
{
namespace
instance
{
#ifdef CK_ENABLE_FP32
// float
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
...
...
@@ -28,7 +27,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instanc
F32
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -40,7 +40,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instanc
F32
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -52,7 +53,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instanc
F32
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -64,10 +66,115 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instanc
F32
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
#endif
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
F16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
F16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
F16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
F16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
BF16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
BF16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
BF16
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F32
,
F32
,
Empty_Tuple
,
F32
,
PassThrough
,
PassThrough
,
Scale
,
BF16
>>>&
instances
);
#endif // CK_ENABLE_FP32
#ifdef CK_ENABLE_FP64
// double
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
...
...
@@ -78,7 +185,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instanc
F64
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F64
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -90,7 +198,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instanc
F64
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F64
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -102,7 +211,8 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instanc
F64
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
Scale
,
F64
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
...
...
@@ -114,15 +224,178 @@ void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instanc
F64
,
PassThrough
,
PassThrough
,
Scale
>>>&
instances
);
#endif
Scale
,
F64
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
Empty_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
Empty_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
Empty_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F64
,
F64
,
Empty_Tuple
,
F64
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
F16
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
void
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
Empty_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Scale
,
F32
>>>&
instances
);
#endif // CK_ENABLE_FP16
// Contraction + Scale
template
<
index_t
NumDimM
,
index_t
NumDimN
,
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
>
typename
EDataType
,
typename
ComputeDataType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
...
...
@@ -133,7 +406,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Scale
>>
ck
::
tensor_operation
::
element_wise
::
Scale
,
ComputeDataType
>>
{
using
DeviceOp
=
DeviceContractionMultipleD
<
NumDimM
,
NumDimN
,
...
...
@@ -144,7 +418,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
Scale
>
;
ck
::
tensor_operation
::
element_wise
::
Scale
,
ComputeDataType
>
;
static
auto
GetInstances
()
{
...
...
@@ -155,34 +430,113 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceContra
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance
(
op_ptrs
);
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
ck
::
half_t
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
ck
::
bhalf_t
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance
(
op_ptrs
);
}
}
}
#endif
#endif
// CK_ENABLE_FP32
#ifdef CK_ENABLE_FP64
if
constexpr
(
is_same_v
<
ADataType
,
double
>
&&
is_same_v
<
BDataType
,
double
>
&&
is_same_v
<
EDataType
,
double
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance
(
op_ptrs
);
if
constexpr
(
is_same_v
<
ComputeDataType
,
double
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance
(
op_ptrs
);
}
}
}
#endif // CK_ENABLE_FP64
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
half_t
>
&&
is_same_v
<
BDataType
,
ck
::
half_t
>
&&
is_same_v
<
EDataType
,
ck
::
half_t
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance
(
op_ptrs
);
}
}
}
#endif // CK_ENABLE_FP16
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
BDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
EDataType
,
ck
::
bhalf_t
>
)
{
if
constexpr
(
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
2
)
{
if
constexpr
(
is_same_v
<
ComputeDataType
,
float
>
)
{
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance
(
op_ptrs
);
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance
(
op_ptrs
);
}
}
}
#endif
#endif
// CK_ENABLE_BF16
return
op_ptrs
;
}
};
...
...
library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
View file @
9684677a
...
...
@@ -19,109 +19,214 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
// GNWC/GNHWC/GNDHWC
// Image to Column
//
nhwc
, 1d
void
add_device_image_to_column_nwc_1d_bf16_instances
(
//
GNWC
, 1d
void
add_device_image_to_column_
g
nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_f16_instances
(
void
add_device_image_to_column_
g
nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_f32_instances
(
void
add_device_image_to_column_
g
nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwc_1d_i8_instances
(
void
add_device_image_to_column_
g
nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
//
nhwc
, 2d
void
add_device_image_to_column_nhwc_2d_bf16_instances
(
//
GNHWC
, 2d
void
add_device_image_to_column_
g
nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f16_instances
(
void
add_device_image_to_column_
g
nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_f32_instances
(
void
add_device_image_to_column_
g
nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwc_2d_i8_instances
(
void
add_device_image_to_column_
g
nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
//
nhwc
, 3d
void
add_device_image_to_column_ndhwc_3d_bf16_instances
(
//
GNDHWC
, 3d
void
add_device_image_to_column_
g
ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_f16_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_f32_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwc_3d_i8_instances
(
void
add_device_image_to_column_
g
ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// Column to Image
//
nhwc
, 1d
void
add_device_column_to_image_nwc_1d_bf16_instances
(
//
GNWC
, 1d
void
add_device_column_to_image_
g
nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_f16_instances
(
void
add_device_column_to_image_
g
nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_f32_instances
(
void
add_device_column_to_image_
g
nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwc_1d_i8_instances
(
void
add_device_column_to_image_
g
nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
//
nhwc
, 2d
void
add_device_column_to_image_nhwc_2d_bf16_instances
(
//
GNHWC
, 2d
void
add_device_column_to_image_
g
nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f16_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_f32_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwc_2d_i8_instances
(
void
add_device_column_to_image_
g
nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
//
nhwc
, 3d
void
add_device_column_to_image_ndhwc_3d_bf16_instances
(
//
GNDHWC
, 3d
void
add_device_column_to_image_
g
ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_f16_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_f32_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwc_3d_i8_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// NWGC/NHWGC/NDHWGC
// Image to Column
// NWGC, 1d
void
add_device_image_to_column_nwgc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwgc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwgc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nwgc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// NHWGC, 2d
void
add_device_image_to_column_nhwgc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwgc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwgc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_nhwgc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// NDHWGC, 3d
void
add_device_image_to_column_ndhwgc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
BF16
,
BF16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwgc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F16
,
F16
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwgc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F32
,
F32
,
ImageToColumn
>>>&
instances
);
void
add_device_image_to_column_ndhwgc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
int8_t
,
int8_t
,
ImageToColumn
>>>&
instances
);
// Column to Image
// NWGC, 1d
void
add_device_column_to_image_nwgc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwgc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwgc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nwgc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// NHWGC, 2d
void
add_device_column_to_image_nhwgc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwgc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwgc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_nhwgc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
// NDHWGC, 3d
void
add_device_column_to_image_ndhwgc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwgc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwgc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
);
void
add_device_column_to_image_ndhwgc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
);
template
<
ck
::
index_t
NumDimSpatial
,
typename
ImageLayout
,
...
...
@@ -151,60 +256,120 @@ struct DeviceOperationInstanceFactory<
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nwc_1d_f32_instances
(
op_ptrs
);
add_device_image_to_column_
g
nwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nwc_1d_f16_instances
(
op_ptrs
);
add_device_image_to_column_
g
nwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nwc_1d_bf16_instances
(
op_ptrs
);
add_device_image_to_column_
g
nwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nwc_1d_i8_instances
(
op_ptrs
);
add_device_image_to_column_
g
nwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwc_2d_f32_instances
(
op_ptrs
);
add_device_image_to_column_
g
nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwc_2d_f16_instances
(
op_ptrs
);
add_device_image_to_column_
g
nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwc_2d_bf16_instances
(
op_ptrs
);
add_device_image_to_column_
g
nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwc_2d_i8_instances
(
op_ptrs
);
add_device_image_to_column_
g
nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_ndhwc_3d_f32_instances
(
op_ptrs
);
add_device_image_to_column_
g
ndhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_ndhwc_3d_f16_instances
(
op_ptrs
);
add_device_image_to_column_
g
ndhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_ndhwc_3d_bf16_instances
(
op_ptrs
);
add_device_image_to_column_
g
ndhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_ndhwc_3d_i8_instances
(
op_ptrs
);
add_device_image_to_column_gndhwc_3d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
NWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nwgc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nwgc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nwgc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nwgc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
NHWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_nhwgc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_nhwgc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_nhwgc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_nhwgc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
NDHWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_image_to_column_ndhwgc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_image_to_column_ndhwgc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_image_to_column_ndhwgc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_image_to_column_ndhwgc_3d_i8_instances
(
op_ptrs
);
}
}
}
...
...
@@ -214,60 +379,120 @@ struct DeviceOperationInstanceFactory<
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nwc_1d_f32_instances
(
op_ptrs
);
add_device_column_to_image_
g
nwc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nwc_1d_f16_instances
(
op_ptrs
);
add_device_column_to_image_
g
nwc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nwc_1d_bf16_instances
(
op_ptrs
);
add_device_column_to_image_
g
nwc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nwc_1d_i8_instances
(
op_ptrs
);
add_device_column_to_image_
g
nwc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
GNHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwc_2d_f32_instances
(
op_ptrs
);
add_device_column_to_image_
g
nhwc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwc_2d_f16_instances
(
op_ptrs
);
add_device_column_to_image_
g
nhwc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwc_2d_bf16_instances
(
op_ptrs
);
add_device_column_to_image_
g
nhwc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwc_2d_i8_instances
(
op_ptrs
);
add_device_column_to_image_
g
nhwc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
GNDHWC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_ndhwc_3d_f32_instances
(
op_ptrs
);
add_device_column_to_image_gndhwc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_gndhwc_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_gndhwc_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_gndhwc_3d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
1
&&
is_same_v
<
ImageLayout
,
NWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nwgc_1d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nwgc_1d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nwgc_1d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nwgc_1d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
2
&&
is_same_v
<
ImageLayout
,
NHWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_nhwgc_2d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_nhwgc_2d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_nhwgc_2d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_nhwgc_2d_i8_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
ImageLayout
,
NDHWGC
>
)
{
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_column_to_image_ndhwgc_3d_f32_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
{
add_device_column_to_image_ndhwc_3d_f16_instances
(
op_ptrs
);
add_device_column_to_image_ndhw
g
c_3d_f16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_column_to_image_ndhwc_3d_bf16_instances
(
op_ptrs
);
add_device_column_to_image_ndhw
g
c_3d_bf16_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_column_to_image_ndhwc_3d_i8_instances
(
op_ptrs
);
add_device_column_to_image_ndhw
g
c_3d_i8_instances
(
op_ptrs
);
}
}
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
View file @
9684677a
...
...
@@ -328,7 +328,18 @@ void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
void
add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F8
,
F8
,
F8
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
template
<
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
...
...
@@ -548,6 +559,20 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
ck
::
half_t
>
&&
is_same_v
<
BDataType
,
ck
::
f8_t
>
&&
is_same_v
<
CDataType
,
ck
::
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_instance.hpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
namespace
ck
::
tensor_layout
::
convolution
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ScaleAddScaleAddRelu
=
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
;
static
constexpr
auto
ConvFwdDefault
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
ConvFwd1x1P0
=
ConvolutionForwardSpecialization
::
Filter1x1Pad0
;
static
constexpr
auto
ConvFwd1x1S1P0
=
ConvolutionForwardSpecialization
::
Filter1x1Stride1Pad0
;
static
constexpr
auto
ConvFwdOddC
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
OddC
;
static
constexpr
auto
GemmMNKPadding
=
GemmSpecialization
::
MNKPadding
;
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_bf16_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
BF16
,
BF16
,
F32
,
BF16
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
BF16
,
BF16
,
F32
,
BF16
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
BF16
,
BF16
,
F32
,
BF16
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
BF16
,
BF16
,
F32
,
BF16
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
// clang-format on
>
;
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f16_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
ck
::
Tuple
<
F16
,
F16
>
,
F16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
ck
::
Tuple
<
F16
,
F16
>
,
F16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
ck
::
Tuple
<
F16
,
F16
>
,
F16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F16
,
F16
,
F32
,
F16
,
ck
::
Tuple
<
F16
,
F16
>
,
F16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
// clang-format on
>
;
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_f32_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F32
,
F32
,
F32
,
F32
,
ck
::
Tuple
<
F32
,
F32
>
,
F32
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F32
,
F32
,
F32
,
F32
,
ck
::
Tuple
<
F32
,
F32
>
,
F32
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
16
,
4
,
4
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
8
,
1
,
8
>
,
1
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F32
,
F32
,
F32
,
F32
,
ck
::
Tuple
<
F32
,
F32
>
,
F32
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
16
,
4
,
4
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
F32
,
F32
,
F32
,
F32
,
ck
::
Tuple
<
F32
,
F32
>
,
F32
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
16
,
4
,
4
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
4
,
4
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
>
// clang-format on
>
;
template
<
index_t
NDimSpatial
,
typename
ALayout
,
typename
BLayout
,
typename
DsLayout
,
typename
ELayout
,
ConvolutionForwardSpecialization
ConvSpec
>
using
device_grouped_conv_fwd_xdl_scaleadd_scaleadd_relu_int8_instances
=
std
::
tuple
<
// clang-format off
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// generic instance
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
int8_t
,
int8_t
,
int32_t
,
int8_t
,
ck
::
Tuple
<
F32
,
F32
>
,
int8_t
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
64
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
// instances for small conv.K and conv.C
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
int8_t
,
int8_t
,
int32_t
,
int8_t
,
ck
::
Tuple
<
F32
,
F32
>
,
int8_t
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
64
,
64
,
32
,
32
,
8
,
8
,
32
,
32
,
2
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
1
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
int8_t
,
int8_t
,
int32_t
,
int8_t
,
ck
::
Tuple
<
F32
,
F32
>
,
int8_t
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
128
,
128
,
32
,
8
,
8
,
32
,
32
,
2
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
1
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
,
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
<
NDimSpatial
,
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
int8_t
,
int8_t
,
int32_t
,
int8_t
,
ck
::
Tuple
<
F32
,
F32
>
,
int8_t
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
,
ConvSpec
,
GemmMNKPadding
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
// clang-format on
>
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
View file @
9684677a
...
...
@@ -496,7 +496,8 @@ struct DeviceOperationInstanceFactory<
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
is_same_v
<
OutDataType
,
F16
>
&&
is_same_v
<
ComputeTypeA
,
F16
>
&&
is_same_v
<
ComputeTypeB
,
F16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_f16_instances
(
...
...
@@ -507,14 +508,16 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
is_same_v
<
OutDataType
,
F32
>
&&
is_same_v
<
ComputeTypeA
,
F32
>
&&
is_same_v
<
ComputeTypeB
,
F32
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
is_same_v
<
OutDataType
,
BF16
>
&&
is_same_v
<
ComputeTypeA
,
BF16
>
&&
is_same_v
<
ComputeTypeB
,
BF16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances
(
op_ptrs
);
...
...
@@ -522,7 +525,9 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_gnhwk_gkyxc_gnhwc_i8_1x1s1p0_instances
(
...
...
@@ -535,7 +540,8 @@ struct DeviceOperationInstanceFactory<
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
is_same_v
<
OutDataType
,
F16
>
&&
is_same_v
<
ComputeTypeA
,
F16
>
&&
is_same_v
<
ComputeTypeB
,
F16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_f16_instances
(
...
...
@@ -546,14 +552,16 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
is_same_v
<
OutDataType
,
F32
>
&&
is_same_v
<
ComputeTypeA
,
F32
>
&&
is_same_v
<
ComputeTypeB
,
F32
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
is_same_v
<
OutDataType
,
BF16
>
&&
is_same_v
<
ComputeTypeA
,
BF16
>
&&
is_same_v
<
ComputeTypeB
,
BF16
>
)
{
add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances
(
op_ptrs
);
...
...
@@ -561,7 +569,9 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_bwd_data_wmma_nhwgk_gkyxc_nhwgc_i8_1x1s1p0_instances
(
...
...
@@ -578,7 +588,8 @@ struct DeviceOperationInstanceFactory<
{
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
F16
>
&&
is_same_v
<
WeiDataType
,
F16
>
&&
is_same_v
<
OutDataType
,
F16
>
)
is_same_v
<
OutDataType
,
F16
>
&&
is_same_v
<
ComputeTypeA
,
F16
>
&&
is_same_v
<
ComputeTypeB
,
F16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances
(
op_ptrs
);
...
...
@@ -590,7 +601,8 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
is_same_v
<
OutDataType
,
F32
>
&&
is_same_v
<
ComputeTypeA
,
F32
>
&&
is_same_v
<
ComputeTypeB
,
F32
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances
(
op_ptrs
);
...
...
@@ -598,7 +610,8 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
is_same_v
<
OutDataType
,
BF16
>
&&
is_same_v
<
ComputeTypeA
,
BF16
>
&&
is_same_v
<
ComputeTypeB
,
BF16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances
(
op_ptrs
);
...
...
@@ -606,7 +619,9 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_data_wmma_gndhwk_gkzyxc_gndhwc_i8_instances
(
op_ptrs
);
...
...
@@ -642,7 +657,8 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_FP32
else
if
constexpr
(
is_same_v
<
InDataType
,
F32
>
&&
is_same_v
<
WeiDataType
,
F32
>
&&
is_same_v
<
OutDataType
,
F32
>
)
is_same_v
<
OutDataType
,
F32
>
&&
is_same_v
<
ComputeTypeA
,
F32
>
&&
is_same_v
<
ComputeTypeB
,
F32
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances
(
op_ptrs
);
...
...
@@ -650,7 +666,8 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_BF16
else
if
constexpr
(
is_same_v
<
InDataType
,
BF16
>
&&
is_same_v
<
WeiDataType
,
BF16
>
&&
is_same_v
<
OutDataType
,
BF16
>
)
is_same_v
<
OutDataType
,
BF16
>
&&
is_same_v
<
ComputeTypeA
,
BF16
>
&&
is_same_v
<
ComputeTypeB
,
BF16
>
)
{
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances
(
op_ptrs
);
...
...
@@ -658,7 +675,9 @@ struct DeviceOperationInstanceFactory<
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_data_wmma_ndhwgk_gkzyxc_ndhwgc_i8_instances
(
op_ptrs
);
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
View file @
9684677a
...
...
@@ -618,7 +618,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instances
(
op_ptrs
);
...
...
@@ -628,7 +629,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances
(
op_ptrs
);
...
...
@@ -638,7 +640,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances
(
...
...
@@ -655,21 +659,25 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef DL_KERNELS
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances
(
op_ptrs
);
...
...
@@ -685,7 +693,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
...
...
@@ -697,7 +706,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
...
...
@@ -709,7 +719,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances
(
...
...
@@ -725,7 +737,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
...
...
@@ -737,7 +750,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
...
...
@@ -749,7 +763,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances
(
...
...
@@ -768,7 +784,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instances
(
...
...
@@ -780,7 +797,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeTypeA
,
half_t
>
&&
is_same_v
<
ComputeTypeB
,
half_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances
(
...
...
@@ -796,7 +814,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances
(
...
...
@@ -808,7 +828,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
op_ptrs
);
...
...
@@ -822,7 +844,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeTypeA
,
float
>
&&
is_same_v
<
ComputeTypeB
,
float
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
...
...
@@ -851,7 +874,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeA
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeTypeB
,
ck
::
bhalf_t
>
)
{
#ifdef DL_KERNELS
add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances
(
...
...
@@ -863,7 +888,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeTypeA
,
int8_t
>
&&
is_same_v
<
ComputeTypeB
,
int8_t
>
)
{
add_device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances
(
op_ptrs
);
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
View file @
9684677a
...
...
@@ -928,28 +928,29 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeType
,
bhalf_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeType
,
int8_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances
(
op_ptrs
);
}
...
...
@@ -961,7 +962,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances
(
op_ptrs
);
}
...
...
@@ -969,7 +970,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances
(
op_ptrs
);
}
...
...
@@ -977,7 +978,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
...
...
@@ -989,7 +990,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances
(
op_ptrs
);
}
...
...
@@ -997,7 +998,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeType
,
bhalf_t
>
)
{
add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances
(
op_ptrs
);
}
...
...
@@ -1005,7 +1007,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeType
,
int8_t
>
)
{
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instances
(
op_ptrs
);
...
...
@@ -1021,7 +1023,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances
(
op_ptrs
);
}
...
...
@@ -1029,7 +1031,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances
(
op_ptrs
);
}
...
...
@@ -1037,7 +1039,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances
(
op_ptrs
);
}
...
...
@@ -1045,7 +1047,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#if(defined(CK_ENABLE_FP16) && defined(DL_KERNELS))
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances
(
op_ptrs
);
}
...
...
@@ -1053,14 +1055,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeType
,
bhalf_t
>
)
{
add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeType
,
int8_t
>
)
{
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instances
(
op_ptrs
);
add_device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instances
(
op_ptrs
);
...
...
@@ -1075,14 +1078,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
)
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instances
(
op_ptrs
);
...
...
@@ -1095,14 +1098,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeType
,
bhalf_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeType
,
int8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instances
(
op_ptrs
);
...
...
@@ -1119,7 +1123,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
is_same_v
<
OutDataType
,
float
>
&&
is_same_v
<
ComputeType
,
float
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
op_ptrs
);
}
...
...
@@ -1148,14 +1152,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
ComputeType
,
bhalf_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
is_same_v
<
OutDataType
,
int8_t
>
&&
is_same_v
<
ComputeType
,
int8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances
(
op_ptrs
);
add_device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instances
(
op_ptrs
);
...
...
library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ScaleAddScaleAddRelu
=
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
;
#ifdef CK_ENABLE_BF16
// grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK
void
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
ck
::
Tuple
<
NDHWGK
,
NDHWGK
>
,
NDHWGK
,
BF16
,
BF16
,
ck
::
Tuple
<
BF16
,
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP16
void
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
ck
::
Tuple
<
NDHWGK
,
NDHWGK
>
,
NDHWGK
,
F16
,
F16
,
ck
::
Tuple
<
F16
,
F16
>
,
F16
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
>>>&
instances
);
#endif
#ifdef CK_ENABLE_FP32
void
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
ck
::
Tuple
<
NDHWGK
,
NDHWGK
>
,
NDHWGK
,
F32
,
F32
,
ck
::
Tuple
<
F32
,
F32
>
,
F32
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
>>>&
instances
);
#endif
#ifdef CK_ENABLE_INT8
void
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedConvFwdMultipleD
<
3
,
NDHWGC
,
GKZYXC
,
ck
::
Tuple
<
NDHWGK
,
NDHWGK
>
,
NDHWGK
,
int8_t
,
int8_t
,
ck
::
Tuple
<
F32
,
F32
>
,
int8_t
,
PassThrough
,
PassThrough
,
ScaleAddScaleAddRelu
>>>&
instances
);
#endif
template
<
ck
::
index_t
NumDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
DLayouts
,
typename
OutLayout
,
typename
InDataType
,
typename
WeiDataType
,
typename
DDataTypes
,
typename
OutDataType
,
typename
ComputeType
>
struct
DeviceOperationInstanceFactory
<
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD
<
NumDimSpatial
,
InLayout
,
WeiLayout
,
DLayouts
,
OutLayout
,
InDataType
,
WeiDataType
,
DDataTypes
,
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
,
ComputeType
>>
{
using
DeviceOp
=
DeviceGroupedConvFwdMultipleD
<
NumDimSpatial
,
InLayout
,
WeiLayout
,
DLayouts
,
OutLayout
,
InDataType
,
WeiDataType
,
DDataTypes
,
OutDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
,
ComputeType
>
;
static
auto
GetInstances
()
{
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
if
constexpr
(
NumDimSpatial
==
3
&&
is_same_v
<
InLayout
,
NDHWGC
>
&&
is_same_v
<
WeiLayout
,
GKZYXC
>
&&
is_same_v
<
OutLayout
,
NDHWGK
>
)
{
#ifdef CK_ENABLE_FP32
if
constexpr
(
is_same_v
<
InDataType
,
float
>
&&
is_same_v
<
WeiDataType
,
float
>
&&
is_same_v
<
OutDataType
,
float
>
)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f32_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_FP16
if
constexpr
(
is_same_v
<
InDataType
,
half_t
>
&&
is_same_v
<
WeiDataType
,
half_t
>
&&
is_same_v
<
OutDataType
,
half_t
>
&&
is_same_v
<
ComputeType
,
half_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_f16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_BF16
if
constexpr
(
is_same_v
<
InDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
WeiDataType
,
ck
::
bhalf_t
>
&&
is_same_v
<
OutDataType
,
ck
::
bhalf_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_bf16_instances
(
op_ptrs
);
}
#endif
#ifdef CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
InDataType
,
int8_t
>
&&
is_same_v
<
WeiDataType
,
int8_t
>
&&
is_same_v
<
OutDataType
,
int8_t
>
)
{
add_device_grouped_conv3d_fwd_xdl_scaleadd_scaleadd_relu_ndhwgc_gkzyxc_ndhwgk_int8_instances
(
op_ptrs
);
}
#endif
}
return
op_ptrs
;
}
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
View file @
9684677a
...
...
@@ -120,6 +120,32 @@ void add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemm
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F16
,
F8
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemm
<
Row
,
Row
,
Empty_Tuple
,
Row
,
F8
,
F16
,
Empty_Tuple
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
template
<
typename
ALayout
,
typename
BLayout
,
typename
ELayout
,
...
...
@@ -184,6 +210,24 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
f8_t
>
&&
is_same_v
<
EDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instances
(
op_ptrs
);
}
}
else
if
constexpr
(
is_same_v
<
ADataType
,
f8_t
>
&&
is_same_v
<
BDataType
,
half_t
>
&&
is_same_v
<
EDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances
(
op_ptrs
);
}
}
return
op_ptrs
;
}
};
...
...
library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
View file @
9684677a
add_instance_library
(
device_column_to_image_instance
device_column_to_image_nhwc_1d_instance.cpp
device_column_to_image_nhwc_2d_instance.cpp
device_column_to_image_nhwc_3d_instance.cpp
device_column_to_image_gnwc_1d_instance.cpp
device_column_to_image_gnhwc_2d_instance.cpp
device_column_to_image_gndhwc_3d_instance.cpp
device_column_to_image_nwgc_1d_instance.cpp
device_column_to_image_nhwgc_2d_instance.cpp
device_column_to_image_ndhwgc_3d_instance.cpp
)
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
n
hwc_3d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
gnd
hwc_3d_instance.cpp
View file @
9684677a
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_ndhwc_3d_bf16_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_f16_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_f32_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances(
#endif
}
void
add_device_column_to_image_ndhwc_3d_i8_instances
(
void
add_device_column_to_image_
g
ndhwc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
GNDHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
g
nhwc_2d_instance.cpp
View file @
9684677a
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nhwc_2d_bf16_instances
(
void
add_device_column_to_image_
g
nhwc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_f16_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_f32_instances
(
void
add_device_column_to_image_
g
nhwc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances(
#endif
}
void
add_device_column_to_image_nhwc_2d_i8_instances
(
void
add_device_column_to_image_
g
nhwc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
GNHWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_n
h
wc_1d_instance.cpp
→
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_
g
nwc_1d_instance.cpp
View file @
9684677a
...
...
@@ -11,7 +11,7 @@ namespace instance {
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nwc_1d_bf16_instances
(
void
add_device_column_to_image_
g
nwc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_f16_instances
(
void
add_device_column_to_image_
g
nwc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_f32_instances
(
void
add_device_column_to_image_
g
nwc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
...
...
@@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances(
#endif
}
void
add_device_column_to_image_nwc_1d_i8_instances
(
void
add_device_column_to_image_
g
nwc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
GNWC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
...
...
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_ndhwgc_3d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_ndhwgc_3d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
3
,
NDHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
3
,
NDHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nhwgc_2d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nhwgc_2d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
2
,
NHWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
2
,
NHWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
namespace
ck
::
conv_tensor_rearrange_op
;
void
add_device_column_to_image_nwgc_1d_bf16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
BF16
,
BF16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_BF16
add_device_operation_instances
(
instances
,
device_column_to_image_bf16_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F16
,
F16
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP16
add_device_operation_instances
(
instances
,
device_column_to_image_f16_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
F32
,
F32
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_FP32
add_device_operation_instances
(
instances
,
device_column_to_image_f32_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
void
add_device_column_to_image_nwgc_1d_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceConvTensorRearrange
<
1
,
NWGC
,
int8_t
,
int8_t
,
ColumnToImage
>>>&
instances
)
{
#ifdef CK_ENABLE_INT8
add_device_operation_instances
(
instances
,
device_column_to_image_i8_instances
<
1
,
NWGC
>
{});
#else
ignore
=
instances
;
#endif
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt
View file @
9684677a
set
(
DEVICE_CONTRACTION_BILINEAR_INSTANCES
)
#float
# FP32
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
)
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
)
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp
)
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp
)
#
double
#
FP64
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
)
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
)
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp
)
# FP16
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
)
# BF16
list
(
APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
)
add_instance_library
(
device_contraction_bilinear_instance
${
DEVICE_CONTRACTION_BILINEAR_INSTANCES
}
)
library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
0 → 100644
View file @
9684677a
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
=
device_contraction_kk_instance
<
BF16
,
BF16
,
F32
,
BF16
,
BF16_Tuple
,
BF16
,
F32
,
PassThrough
,
PassThrough
,
Bilinear
>
;
void
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
(
std
::
vector
<
std
::
unique_ptr
<
DeviceContractionMultipleD
<
2
,
2
,
2
,
BF16
,
BF16
,
BF16_Tuple
,
BF16
,
PassThrough
,
PassThrough
,
Bilinear
,
F32
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
2
3
4
5
6
7
8
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment