Merge branch 'develop' into amd-develop

a8fafc3f · Jun Liu · 4939ee59 · 844b215d · a8fafc3f · a8fafc3f
Commit a8fafc3f authored Jul 21, 2023 by Jun Liu
20 changed files
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

-#include <cstdlib>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F32 = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNHWC = ck::tensor_layout::convolution::GNHWC;
-using GKYXC = ck::tensor_layout::convolution::GKYXC;
-using GNHWK = ck::tensor_layout::convolution::GNHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvBwdWeightDefault =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
-
-static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
-
-// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
-using device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f32_default_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-        // clang-format on
-        >;
-
-using device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f32_instances = std::tuple<
-    // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       2,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-    // clang-format on
-    >;
-
+// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
 void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           GNHWC,
@@ -90,12 +23,22 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
+    // 1. Default
    add_device_operation_instances(
        instances,
-        device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f32_default_instances{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f32_instances{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
+                                                                   GNHWC,
+                                                                   GKYXC,
+                                                                   GNHWK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       GNHWC,
+                                       GKYXC,
+                                       GNHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
+                                                                    NHWGC,
+                                                                    GKYXC,
+                                                                    NHWGK,
+                                                                    ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
+                                                                   NHWGC,
+                                                                   GKYXC,
+                                                                   NHWGK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
+                                                                   NHWGC,
+                                                                   GKYXC,
+                                                                   NHWGK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+add_instance_library(device_grouped_conv3d_bwd_data_instance
+   device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+   device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+   device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+   device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+   device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+   device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+)
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+// wo, k]
+void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  GNDHWK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  GNDHWC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
+                                                        GNDHWK,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        GNDHWC,
+                                                        ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
+                                                        GNDHWK,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        GNDHWC,
+                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+// wo, k]
+void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  GNDHWK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  GNDHWC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<3,
+                                                       GNDHWK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       GNDHWC,
+                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<3,
+                                                       GNDHWK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       GNDHWC,
+                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = in[g, n, do, ho,
+// wo, k]
+void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  GNDHWK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  GNDHWC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<3,
+                                                       GNDHWK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       GNDHWC,
+                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<3,
+                                                       GNDHWK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       GNDHWC,
+                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
+// g, k]
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  BF16,
+                                                                  BF16,
+                                                                  Empty_Tuple,
+                                                                  BF16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
+                                                        NDHWGK,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        NDHWGC,
+                                                        ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_bf16_instances<3,
+                                                        NDHWGK,
+                                                        GKZYXC,
+                                                        Empty_Tuple,
+                                                        NDHWGC,
+                                                        ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
+// g, k]
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F16,
+                                                                  F16,
+                                                                  Empty_Tuple,
+                                                                  F16,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<3,
+                                                       NDHWGK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGC,
+                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f16_instances<3,
+                                                       NDHWGK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGC,
+                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/device_grouped_conv3d_bwd_data_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for out[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = in[n, do, ho, wo,
+// g, k]
+void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdDataMultipleD<3,
+                                                                  NDHWGK,
+                                                                  GKZYXC,
+                                                                  Empty_Tuple,
+                                                                  NDHWGC,
+                                                                  F32,
+                                                                  F32,
+                                                                  Empty_Tuple,
+                                                                  F32,
+                                                                  PassThrough,
+                                                                  PassThrough,
+                                                                  PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<3,
+                                                       NDHWGK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGC,
+                                                       ConvBwdDataDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_data_xdl_f32_instances<3,
+                                                       NDHWGK,
+                                                       GKZYXC,
+                                                       Empty_Tuple,
+                                                       NDHWGC,
+                                                       ConvBwdDataFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -2,4 +2,7 @@ add_instance_library(device_grouped_conv3d_bwd_weight_instance
    device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
    device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
    device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+    device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+    device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+    device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
 )
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
@@ -5,81 +5,17 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using BF16 = bhalf_t;
-using F32  = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvBwdWeightDefault =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
-
-static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
-
-// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k]
-using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                       ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|   BBlockTransfer|  BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                             Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                     Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                   |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                 |                |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,               ConvBwdWeightDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-        // clang-format on
-        >;
-
-using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_bf16_f32_bf16_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                       ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|   BBlockTransfer|  BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                             Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                     Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                   |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                 |                |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdWeightFilter1x1Stride1Pad0,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
@@ -92,12 +28,22 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
+    // 1. Default
    add_device_operation_instances(
        instances,
-        device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_bf16_f32_bf16_instances{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
+                                                                    GNDHWC,
+                                                                    GKZYXC,
+                                                                    GNDHWK,
+                                                                    ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       GNDHWC,
+                                       GKZYXC,
+                                       GNDHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -5,81 +5,17 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F16 = ck::half_t;
-using F32 = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvBwdWeightDefault =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
-
-static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
-
-// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k]
-using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f16_default_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
-using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f16_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
@@ -92,12 +28,22 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
+    // 1. Default
    add_device_operation_instances(
        instances,
-        device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f16_default_instances{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f16_instances{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
+                                                                   GNDHWC,
+                                                                   GKZYXC,
+                                                                   GNDHWK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       GNDHWC,
+                                       GKZYXC,
+                                       GNDHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -5,80 +5,17 @@

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {

-using F32 = float;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
-
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-static constexpr auto ConvBwdWeightDefault =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
-
-static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
-    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
-
-// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k]
-using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f32_default_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,              ConvBwdWeightDefault,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-        // clang-format on
-        >;
-
-using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f32_instances =
-    std::tuple<
-        // clang-format off
-        //#########################################|     Num| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle<       3,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
-        // clang-format on
-        >;
-
 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
@@ -91,12 +28,22 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
+    // 1. Default
    add_device_operation_instances(
        instances,
-        device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f32_default_instances{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f32_instances{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
+                                                                   GNDHWC,
+                                                                   GKZYXC,
+                                                                   GNDHWK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       GNDHWC,
+                                       GKZYXC,
+                                       GNDHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
 }

 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
+                                                                   NDHWGC,
+                                                                   GKZYXC,
+                                                                   NDHWGK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
+                                                                   NDHWGC,
+                                                                   GKZYXC,
+                                                                   NDHWGK,
+                                                                   ConvBwdWeightDefault>{});
+    // 2. Filter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
+if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
 set(CONV2D_PERLAYER_QUANT_SRC
    conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
    conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
@@ -36,3 +37,4 @@ add_instance_library(device_quantization_instance
    ${CONV2D_BIAS_PERCHANNEL_QUANT_SRC}
    ${GEMM_QUANT_SRC}
 )
+endif()
\ No newline at end of file
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -141,3 +141,46 @@ avg_time: 0.768321
 tflops: 86.6679
 GB/s: 127.947
 ```
+
+## Profile grouped convolution backward weight kernels
+```bash
+# arg1: tensor operation (grouped_conv_bwd_weight: Grouped Convolution Backward Weight)
+# arg2: data type (0: Input fp32, Weight fp32, Output fp32
+#                  1: Input fp16, Weight fp16, Output fp16
+#                  2: Input bf16, Weight fp32, Output bf16)
+# arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, N, K, Ho, Wo]
+#                      1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
+#                      2: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]
+# arg4: verification (0: no, 1: yes)
+# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
+# arg6: print tensor value (0: no; 1: yes)
+# arg7: time kernel (0: no, 1: yes)
+# Following arguments (depending on number of spatial dims):
+#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
+#  G, N, K, C, 
+#  <filter spatial dimensions>, (ie Y, X for 2D)
+#  <input image spatial dimensions>, (ie Hi, Wi for 2D)
+#  <strides>, (ie Sy, Sx for 2D)
+#  <dilations>, (ie Dy, Dx for 2D)
+#  <left padding>, (ie LeftPy, LeftPx for 2D)
+#  <right padding>, (ie RightPy, RightPx for 2D)
+# SplitK
+
+ ################                   op   datatype  layout  verify  init  log  time  Ndims  G   N   K   C  Y  X  Hi  Wi  Sy  Sx  Dy  Dx  LeftPy  LeftPx  RightPy  RightPx  SplitK
+./bin/ckProfiler grouped_conv_bwd_weight          1       0       1     1    0     1      2 32 256 256 512  3  3  28  28   1   1   1   1       1       0        0        0       1
+
+ ```
+
+Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
+```
+input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
+weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
+output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
+....
+Best configuration parameters:
+name: DeviceGroupedConvBwdWeight_Xdl_CShuffle<256, 256, 128, 4, Default, 8, 4, 2, 8, 4, 8, 2, 1, 1, 8>
+avg_time: 68.5216
+tflops: 95.337
+GB/s: 69.2301
+```
+Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.