// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
#include "device_grouped_conv2d_fwd_common.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Instance list for the 2D grouped forward convolution XDL kernel with F16 data.
//
// Names a std::tuple of 13 DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
// instantiations. Every entry fixes (see the column legend inside the tuple):
//   - NumDimSpatial = 2
//   - A/B/E data type F16, accumulator data type F32, CShuffle data type F16
//   - A and B element-wise operation PassThrough
//   - GEMM specialization GemmMNKPadding, NumGemmKPrefetchStage = 1
//   - KPerBlock = 32, AK1 = BK1 = 8, MPerXDL = NPerXDL = 32
//   - all A/B/C scalar-per-vector widths = 8
// The entries differ only in block size (256/128/64), the MPerBlock x NPerBlock
// tile, MXdlPerWave/NXdlPerWave, and the A/B/C thread-cluster lengths.
//
// Template parameters (forwarded unchanged to every entry):
//   ALayout, BLayout, DsLayout, ELayout - layout slots for A, B, Ds and E
//   DsDatatype   - Ds data type slot
//   CDEElementOp - CDE element-wise operation slot
//   ConvSpec     - convolution-forward specialization slot
//
// NOTE(review): entry order is kept as-is; consumers may depend on the tuple
// order - confirm before reordering or inserting entries.
template <typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename DsDatatype,
          typename CDEElementOp,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv2d_fwd_xdl_f16_instances =
    std::tuple<
        // clang-format off
        // Hand-aligned table: one row per instance; the four legend rows below name each column (read top to bottom).
        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F16,   F16,     F32,      F16, DsDatatype,   F16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
        // clang-format on
        >;

// Instance list for the 2D grouped forward convolution XDL kernel with BF16 data.
//
// Names a std::tuple of 13 DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
// instantiations. Every entry fixes (see the column legend inside the tuple):
//   - NumDimSpatial = 2
//   - A/B/E data type BF16, accumulator data type F32, CShuffle data type BF16
//   - A and B element-wise operation PassThrough
//   - GEMM specialization GemmMNKPadding, NumGemmKPrefetchStage = 1
//   - KPerBlock = 32, AK1 = BK1 = 8, MPerXDL = NPerXDL = 32
//   - all A/B/C scalar-per-vector widths = 8
// The entries differ only in block size (256/128/64), the MPerBlock x NPerBlock
// tile, MXdlPerWave/NXdlPerWave, and the A/B/C thread-cluster lengths.
//
// Template parameters (forwarded unchanged to every entry):
//   ALayout, BLayout, DsLayout, ELayout - layout slots for A, B, Ds and E
//   DsDatatype   - Ds data type slot
//   CDEElementOp - CDE element-wise operation slot
//   ConvSpec     - convolution-forward specialization slot
//
// NOTE(review): entry order is kept as-is; consumers may depend on the tuple
// order - confirm before reordering or inserting entries.
template <typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename DsDatatype,
          typename CDEElementOp,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv2d_fwd_xdl_bf16_instances =
    std::tuple<
        // clang-format off
        // Hand-aligned table: one row per instance; the four legend rows below name each column (read top to bottom).
        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    32,   8,   8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    32,   8,   8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,  BF16,  BF16,     F32,     BF16, DsDatatype,  BF16, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               8>
        // clang-format on
        >;

// Instance list for the 2D grouped forward convolution XDL kernel with F32 data.
//
// Names a std::tuple of 13 DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
// instantiations. Every entry fixes (see the column legend inside the tuple):
//   - NumDimSpatial = 2
//   - A/B/E data type F32, accumulator data type F32, CShuffle data type F32
//   - A and B element-wise operation PassThrough
//   - GEMM specialization GemmMNKPadding, NumGemmKPrefetchStage = 1
//   - KPerBlock = 16, AK1 = BK1 = 4, MPerXDL = NPerXDL = 32
//   - all A/B/C scalar-per-vector widths = 4
// The entries differ only in block size (256/128/64), the MPerBlock x NPerBlock
// tile, MXdlPerWave/NXdlPerWave, and the A/B/C thread-cluster lengths.
//
// Template parameters (forwarded unchanged to every entry):
//   ALayout, BLayout, DsLayout, ELayout - layout slots for A, B, Ds and E
//   DsDatatype   - Ds data type slot
//   CDEElementOp - CDE element-wise operation slot
//   ConvSpec     - convolution-forward specialization slot
//
// NOTE(review): entry order is kept as-is; consumers may depend on the tuple
// order - confirm before reordering or inserting entries.
template <typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename DsDatatype,
          typename CDEElementOp,
          ConvolutionForwardSpecialization ConvSpec>
using device_grouped_conv2d_fwd_xdl_f32_instances =
    std::tuple<
        // clang-format off
        // Hand-aligned table: one row per instance; the four legend rows below name each column (read top to bottom).
        //########################################|  NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds| EData|           A|           B|          CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //########################################| Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|  Type| Elementwise| Elementwise|  Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |   Operation|   Operation|    Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|        |        |        |         |        |      |      |        |         |           |      |            |            |             |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    16,   4,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    16,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    16,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    64,    16,   4,   4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    16,   4,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1, 16, 1,  8>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    32,   128,    16,   4,   4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1, 16>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    16,   4,   4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>,
        DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<       2, ALayout, BLayout, DsLayout, ELayout,   F32,   F32,     F32,      F32, DsDatatype,   F32, PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    64,    32,    64,    16,   4,   4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,         1,           1,           1,              S<1,  8, 1,  8>,              4>
        // clang-format on
        >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
