Apply cshuffle to bwd_weight_cshuffle operator

ebb5522c · Mateusz Ozga · root · fdfe2102 · ebb5522c · ebb5522c
Commit ebb5522c authored Nov 12, 2024 by Mateusz Ozga Committed by root Dec 16, 2024
20 changed files
--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -39,33 +39,33 @@ using DeviceConvBwdWeightInstance =
        WeiElementOp,         // WeiElementwiseOperation
        OutElementOp,         // OutElementwiseOperation
        ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
-        256,                  // BlockSize
+        64,                   // BlockSize
-        128,                  // MPerBlock
+        16,                   // MPerBlock
-        128,                  // NPerBlock
+        16,                   // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
        8,                    // K1
-        32,                   // MPerXdl
+        16,                   // MPerXdl
-        32,                   // NPerXdl
+        16,                   // NPerXdl
-        2,                    // MXdlPerWave
+        1,                    // MXdlPerWave
-        2,                    // NXdlPerWave
+        1,                    // NXdlPerWave
-        S<1, 4, 16, 4>,       // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
-        2,                    // ABlockTransferSrcVectorDim
+        1,                    // ABlockTransferSrcVectorDim
-        8,                    // ABlockTransferSrcScalarPerVector
+        1,                    // ABlockTransferSrcScalarPerVector
-        2,                    // ABlockTransferDstScalarPerVector_K1
+        4,                    // ABlockTransferDstScalarPerVector_K1
-        true,                 // ABlockLdsAddExtraM
+        false,                // ABlockLdsAddExtraM
-        S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<4, 16, 1>,          // BBlockTransferThreadClusterLengths_K0_N_K1
-        S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // BBlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // BBlockTransferSrcAccessOrder
-        2,                    // BBlockTransferSrcVectorDim
+        1,                    // BBlockTransferSrcVectorDim
-        8,                    // BBlockTransferSrcScalarPerVector
+        1,                    // BBlockTransferSrcScalarPerVector
-        2,                    // BBlockTransferDstScalarPerVector_K1
+        4,                    // BBlockTransferDstScalarPerVector_K1
-        true,                 // BBlockLdsAddExtraN
+        false,                // BBlockLdsAddExtraN
        1,                    // CShuffleMXdlPerWavePerShuffle
        1,                    // CShuffleNXdlPerWavePerShuffle
-        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        S<1, 8, 1, 8>,        // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-        128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+        1>;                   // CBlockTransferScalarPerVector_NWaveNPerXdl
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,

--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
@@ -41,26 +41,26 @@ using DeviceConvBwdWeightInstance =
        256,                  // BlockSize
        128,                  // MPerBlock
        128,                  // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
        8,                    // K1
        32,                   // MPerXdl
        32,                   // NPerXdl
        2,                    // MXdlPerWave
        2,                    // NXdlPerWave
-        S<1, 4, 16, 4>,       // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
        2,                    // ABlockTransferSrcVectorDim
        8,                    // ABlockTransferSrcScalarPerVector
        2,                    // ABlockTransferDstScalarPerVector_K1
-        true,                 // ABlockLdsAddExtraM
+        false,                // ABlockLdsAddExtraM
-        S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
        2,                    // BBlockTransferSrcVectorDim
        8,                    // BBlockTransferSrcScalarPerVector
        2,                    // BBlockTransferDstScalarPerVector_K1
-        true,                 // BBlockLdsAddExtraN
+        false,                // BBlockLdsAddExtraN
        1,                    // CShuffleMXdlPerWavePerShuffle
        1,                    // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock

--- a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
@@ -40,33 +40,36 @@ using DeviceConvBwdWeightInstance =
        WeiElementOp,         // WeiElementwiseOperation
        OutElementOp,         // OutElementwiseOperation
        ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
-        256,                  // BlockSize
+        64,                   // BlockSize
-        128,                  // MPerBlock
+        16,                   // MPerBlock
-        128,                  // NPerBlock
+        16,                   // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
        8,                    // K1
-        32,                   // MPerXdl
+        16,                   // MPerXdl
-        32,                   // NPerXdl
+        16,                   // NPerXdl
-        2,                    // MXdlPerWave
+        1,                    // MXdlPerWave
-        2,                    // NXdlPerWave
+        1,                    // NXdlPerWave
-        S<1, 4, 16, 4>,       // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
-        S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
-        2,                    // ABlockTransferSrcVectorDim
+        1,                    // ABlockTransferSrcVectorDim
        1,                    // ABlockTransferSrcScalarPerVector
-        1,                    // ABlockTransferDstScalarPerVector_K1
+        4,                    // ABlockTransferDstScalarPerVector_K1
-        true,                 // ABlockLdsAddExtraM
+        false,                // ABlockLdsAddExtraM
-        S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<4, 16, 1>,          // BBlockTransferThreadClusterLengths_K0_N_K1
-        S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
+        S<2, 0, 1>,           // BBlockTransferThreadClusterArrangeOrder
-        S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
+        S<1, 0, 2>,           // BBlockTransferSrcAccessOrder
-        2,                    // BBlockTransferSrcVectorDim
+        1,                    // BBlockTransferSrcVectorDim
        1,                    // BBlockTransferSrcScalarPerVector
-        1,                    // BBlockTransferDstScalarPerVector_K1
+        4,                    // BBlockTransferDstScalarPerVector_K1
-        true,                 // BBlockLdsAddExtraN
+        false,                // BBlockLdsAddExtraN
        1,                    // CShuffleMXdlPerWavePerShuffle
        1,                    // CShuffleNXdlPerWavePerShuffle
-        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        S<1, 8, 1, 8>,        // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        2,                    // CBlockTransferScalarPerVector_NWaveNPerXdl
+        ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
+        ck::BlockGemmPipelineVersion::v1,          // BlkGemmPipelineVer
+        1,                                         // NumGroupsToMerge
        ComputeTypeA,                              // ComputeTypeA
        ComputeTypeB>;                             // ComputeTypeB

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -8,93 +8,143 @@
 #include <sstream>
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp"
+#include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace device {
 template <typename GridwiseGemm,
-          typename FloatA,
+          typename AGridDesc_AK0_M_K1,
-          typename FloatB,
+          typename BGridDesc_BK0_N_K1,
-          typename FloatC,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation,
-          typename AGridDesc_B_K0_M_K1,
-          typename BGridDesc_B_K0_N_K1,
          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-          typename Block2CTileMap,
          typename ComputePtrOffsetOfBatch,
-          bool HasMainKBlockLoop>
+          index_t NumGroupsToMerge,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
 #endif
-        kernel_batched_gemm_xdlops_bwd_weight(
+        kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3(
-            const FloatA* __restrict__ p_a_grid,
+            typename GridwiseGemm::Argument karg,
-            const FloatB* __restrict__ p_b_grid,
+            const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
-            FloatC* __restrict__ p_c_grid,
+            const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CElementwiseOperation c_element_op,
-            const index_t batch_count,
-            const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
-            const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
            const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                c_grid_desc_mblock_mperblock_nblock_nperblock,
-            const Block2CTileMap block_2_ctile_map,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+            const index_t num_k_per_block)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx94__))
-    const index_t num_blocks_per_batch =
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+    const long_index_t a_batch_offset =
-    const long_index_t a_batch_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
-    const long_index_t b_batch_offset = compute_ptr_offset_of_batch.GetBPtrOffset(g_idx);
+    const long_index_t b_batch_offset =
-    const long_index_t c_batch_offset = compute_ptr_offset_of_batch.GetCPtrOffset(g_idx);
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
-    __shared__ FloatA p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatA)];
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-                                                  p_b_grid + b_batch_offset,
+    GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
-                                                  p_c_grid + c_batch_offset,
+                               BGridDesc_BK0_N_K1,
+                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                               HasMainKBlockLoop,
+                               CGlobalMemoryDataOperation,
+                               TailNum>(karg.p_a_grid + a_batch_offset,
+                                        karg.p_b_grid + b_batch_offset,
+                                        karg.p_c_grid + e_batch_offset,
                                        p_shared,
-                                                  a_b_k0_m_k1_grid_desc,
+                                        karg,
-                                                  b_b_k0_n_k1_grid_desc,
+                                        a_grid_desc_ak0_m_ak1,
+                                        b_grid_desc_bk0_n_bk1,
                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  a_element_op,
+                                        k_idx);
-                                                  b_element_op,
-                                                  c_element_op,
-                                                  block_2_ctile_map);
 #else
-    ignore = p_a_grid;
+    ignore = karg;
-    ignore = p_b_grid;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
-    ignore = p_c_grid;
+}
-    ignore = a_b_k0_m_k1_grid_desc;
-    ignore = b_b_k0_n_k1_grid_desc;
+template <typename GridwiseGemm,
-    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+          typename AGridDesc_AK0_M_K1,
-    ignore = a_element_op;
+          typename BGridDesc_BK0_N_K1,
-    ignore = b_element_op;
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-    ignore = c_element_op;
+          typename ComputePtrOffsetOfBatch,
-    ignore = batch_count;
+          index_t NumGroupsToMerge,
-    ignore = block_2_ctile_map;
+          bool HasMainKBlockLoop,
-    ignore = compute_ptr_offset_of_batch;
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
-    compute_ptr_offset_of_batch.GetAPtrOffset(0);
+          TailNumber TailNum       = TailNumber::Full>
-    compute_ptr_offset_of_batch.GetBPtrOffset(0);
+__global__ void
-    compute_ptr_offset_of_batch.GetCPtrOffset(0);
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds(
+            typename GridwiseGemm::Argument karg,
+            const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+            const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const index_t num_k_per_block)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+    // offset base pointer for each work-group
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
+    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+    const long_index_t a_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+    const long_index_t b_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
+                                    BGridDesc_BK0_N_K1,
+                                    CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    HasMainKBlockLoop,
+                                    CGlobalMemoryDataOperation,
+                                    TailNum>(karg.p_a_grid + a_batch_offset,
+                                             karg.p_b_grid + b_batch_offset,
+                                             karg.p_c_grid + e_batch_offset,
+                                             p_shared_0,
+                                             p_shared_1,
+                                             karg,
+                                             a_grid_desc_ak0_m_ak1,
+                                             b_grid_desc_bk0_n_bk1,
+                                             c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                             k_idx);
+#else
+    ignore = karg;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
@@ -138,6 +188,9 @@ template <ck::index_t NDimSpatial,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          index_t NumGroupsToMerge                    = 1,
          typename ComputeTypeA                       = InDataType,
          typename ComputeTypeB                       = ComputeTypeA>
 struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
@@ -154,6 +207,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                        ComputeTypeA,
                                        ComputeTypeB>
 {
+    static_assert(is_same_v<InElementwiseOperation, element_wise::PassThrough>);
+    static_assert(is_same_v<WeiElementwiseOperation, element_wise::PassThrough>);
+    static_assert(is_same_v<OutElementwiseOperation, element_wise::PassThrough>);
    using DeviceOp = DeviceGroupedConvBwdWeight_Xdl_CShuffle;
    using ADataType = OutDataType;
@@ -174,30 +231,18 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};
+    static constexpr GemmSpecialization GemmSpec = GemmSpecialization::Default;
    static constexpr auto K1Number               = Number<K1>{};
    static constexpr auto conv_to_gemm_transformer =
-        TransformConvBwdWeightToGemm<NDimSpatial,
+        TransformConvBwdWeightToGemmV2<NDimSpatial,
                                       MPerBlock,
                                       NPerBlock,
                                       K1Number,
-                                     K0PerBlock,
+                                       K0PerBlock / K1Number,
+                                       NumGroupsToMerge,
                                       ConvBackwardWeightSpecialization>{};
-    // Bytes per 32 lds bank: 32 * 4 bytes
-    static constexpr auto BankLength = 128;
-    static constexpr auto ElePerBank = BankLength / sizeof(ADataType);
-    // M1 & M0
-    static constexpr auto ABlockLdsM1PerBlock = ElePerBank / K1;
-    static constexpr auto ABlockLdsM0PerBlock = MPerBlock / ABlockLdsM1PerBlock;
-    static constexpr auto ABlockLdsM1Padding  = 4;
-    // N1 & N0
-    static constexpr auto BBlockLdsN1PerBlock = ElePerBank / K1;
-    static constexpr auto BBlockLdsN0PerBlock = NPerBlock / BBlockLdsN1PerBlock;
-    static constexpr auto BBlockLdsN1Padding  = 4;
    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
    static auto GetABCGridDesc()
    {
@@ -279,25 +324,27 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
    using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
-    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+    using GridwiseGemm =
-        BlockSize,
+        GridwiseGemm_xdl_cshuffle_v3<tensor_layout::gemm::RowMajor,
+                                     tensor_layout::gemm::ColumnMajor,
+                                     tensor_layout::gemm::RowMajor,
                                     ADataType,
                                     BDataType,
                                     AccDataType,
                                     CDataType,
-        InMemoryDataOperationEnum::AtomicAdd,
+                                     CDataType,
-        AGridDesc_K0_M_K1,
-        BGridDesc_K0_N_K1,
-        CGridDesc_M_N,
                                     AElementwiseOperation,
                                     BElementwiseOperation,
                                     CElementwiseOperation,
+                                     GemmSpec,
+                                     BlockSize,
                                     MPerBlock,
                                     NPerBlock,
                                     K0PerBlock,
+                                     K1,
+                                     K1,
                                     MPerXdl,
                                     NPerXdl,
-        K1,
                                     MXdlPerWave,
                                     NXdlPerWave,
                                     ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -306,39 +353,29 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                     ABlockTransferSrcVectorDim,
                                     ABlockTransferSrcScalarPerVector,
                                     ABlockTransferDstScalarPerVector_K1,
-        false, // AThreadTransferSrcResetCoordinateAfterRun,
+                                     false,
                                     ABlockLdsAddExtraM,
-        ABlockLdsM1PerBlock,
-        ABlockLdsM0PerBlock,
-        ABlockLdsM1Padding,
                                     BBlockTransferThreadClusterLengths_K0_N_K1,
                                     BBlockTransferThreadClusterArrangeOrder,
                                     BBlockTransferSrcAccessOrder,
                                     BBlockTransferSrcVectorDim,
                                     BBlockTransferSrcScalarPerVector,
                                     BBlockTransferDstScalarPerVector_K1,
-        false, // BThreadTransferSrcResetCoordinateAfterRun,
+                                     false,
                                     BBlockLdsAddExtraN,
-        BBlockLdsN1PerBlock,
-        BBlockLdsN0PerBlock,
-        BBlockLdsN1Padding,
                                     CShuffleMXdlPerWavePerShuffle,
                                     CShuffleNXdlPerWavePerShuffle,
-        CBlockTransferScalarPerVector_NWaveNPerXdl,
                                     CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
-        true,
+                                     CBlockTransferScalarPerVector_NWaveNPerXdl,
-        true,
+                                     BlkGemmPipeSched,
-        1,
+                                     BlkGemmPipelineVer,
-        PipelineVersion::v1,
                                     ComputeTypeA,
                                     ComputeTypeB>;
    // Argument
    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
+        decltype(GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            CGridDesc_M_N{}, 1, 1));
-    using Block2CTileMap =
-        decltype(GridwiseGemm::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1));
    struct Argument : public BaseArgument
    {
@@ -368,7 +405,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
              b_grid_desc_kbatch_k0_n_k1_{},
              c_grid_desc_m_n_{},
              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_ctile_map_{},
              compute_ptr_offset_of_batch_{},
              M01_{M01},
              N01_{N01},
@@ -420,9 +456,6 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
            b_grid_desc_kbatch_k0_n_k1_ = descs[I1];
            c_grid_desc_m_n_            = descs[I2];
-            block_2_ctile_map_ =
-                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
            // A/B/C Batch Stride
            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides[0];
@@ -432,15 +465,14 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                                end(filter_spatial_lengths_),
                                index_t{1},
                                std::multiplies<>{});
+            const index_t GemmM = a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            const index_t GemmN = b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
-            if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_,
-                                           b_grid_desc_kbatch_k0_n_k1_,
-                                           c_grid_desc_m_n_,
-                                           block_2_ctile_map_))
-            {
            c_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-            }
+                    c_grid_desc_m_n_,
+                    GridwiseGemm::CalculateMBlock(GemmM),
+                    GridwiseGemm::CalculateNBlock(GemmN));
        }
        const ADataType* p_a_grid_;
@@ -451,10 +483,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        CGridDesc_M_N c_grid_desc_m_n_;
        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
-        Block2CTileMap block_2_ctile_map_;
        // for computing batch offset
-        ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_batch_;
        index_t M01_;
        index_t N01_;
@@ -502,68 +532,629 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+            const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
+            const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
-                                            arg.c_grid_desc_m_n_,
+            const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
-                                            arg.block_2_ctile_map_))
+                                  arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);
-            {
-                throw std::runtime_error(
-                    "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
-            }
-            const index_t grid_size =
+            const ADataType* p_a_grid = arg.p_a_grid_;
-                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Conv_G_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+            typename GridwiseGemm::Argument gemm_arg{
+                p_a_grid, p_b_grid, arg.p_c_grid_, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
-            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(
+                gemm_arg.M, gemm_arg.N, gemm_arg.KBatch, arg.Conv_G_ / NumGroupsToMerge);
-            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+            float ave_time = 0;
-            auto launch_kernel = [&](auto has_main_k_block_loop) {
+            index_t k_grain                  = gemm_arg.KBatch * K0PerBlock;
-                constexpr bool has_main_loop = has_main_k_block_loop.value;
+            index_t K_split                  = (gemm_arg.K + k_grain - 1) / k_grain * K0PerBlock;
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
-                const auto kernel = kernel_batched_gemm_xdlops_bwd_weight<
+            const auto num_k_per_block =
-                    GridwiseGemm,
+                arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(Number<0>{}) / gemm_arg.KBatch;
-                    ADataType,
-                    BDataType,
-                    CDataType,
-                    OutElementwiseOperation,
-                    InElementwiseOperation,
-                    WeiElementwiseOperation,
-                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                    remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                    remove_reference_t<DeviceOp::Block2CTileMap>,
-                    ComputePtrOffsetOfStridedBatch<>,
-                    has_main_loop>;
-                return launch_and_time_kernel(stream_config,
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
+                        gemm_arg_,
+                        stream_config.rotating_count,
+                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
+                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
+                    rotating_mem.Print();
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                    };
+                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        gemm_arg_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                        arg.compute_ptr_offset_of_batch_,
+                        num_k_per_block);
+                }
+                else
+                {
+                    ave_time +=
+                        launch_and_time_kernel(stream_config,
                                               kernel,
-                                              dim3(grid_size),
+                                               dim3(gdx, gdy, gdz),
                                               dim3(BlockSize),
                                               0,
-                                              arg.p_a_grid_,
+                                               gemm_arg,
-                                              arg.p_b_grid_,
-                                              arg.p_c_grid_,
-                                              arg.a_element_op_,
-                                              arg.b_element_op_,
-                                              arg.c_element_op_,
-                                              arg.Conv_G_,
                                               arg.a_grid_desc_kbatch_k0_m_k1_,
                                               arg.b_grid_desc_kbatch_k0_n_k1_,
                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.block_2_ctile_map_,
+                                               arg.compute_ptr_offset_of_batch_,
-                                              arg.compute_ptr_offset_of_batch_);
+                                               num_k_per_block);
+                }
            };
-            if(has_main_k0_block_loop)
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            NumGroupsToMerge,
+                            true,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            NumGroupsToMerge,
+                            true,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
                        {
-                return launch_kernel(integral_constant<bool, true>{});
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    NumGroupsToMerge,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
                        }
                        else
                        {
-                return launch_kernel(integral_constant<bool, false>{});
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
                        }
                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                NumGroupsToMerge,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            NumGroupsToMerge,
+                            false,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            NumGroupsToMerge,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+            return ave_time;
+        }
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
@@ -580,10 +1171,28 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    static bool IsSupportedArgument(const Argument& arg)
    {
+        const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+        const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
+        const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
+                              arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);
+        typename GridwiseGemm::Argument gemm_arg{
+            nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+        const auto num_k_loop = gemm_arg.AK0 / (K0PerBlock / K1);
+        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        {
+            if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages)
+            {
+                return false;
+            }
+        }
        if(!ck::is_xdl_supported())
        {
            return false;
        }
        if constexpr(NDimSpatial == 1)
        {
            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())
@@ -626,10 +1235,38 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
            }
        }
+        if constexpr(NumGroupsToMerge > 1)
+        {
+            // support only if whole M and N can be proccessed on one block
+            if(!(GemmM <= MPerBlock && GemmN <= NPerBlock))
+            {
+                return false;
+            }
+            if(!(arg.Conv_C_ == 1 && arg.Conv_K_ == 1))
+            {
+                return false;
+            }
+            if(arg.Conv_G_ % NumGroupsToMerge != 0)
+            {
+                return false;
+            }
+        }
+        if(!(arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0 &&
+             arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0))
+        {
+            if(!(arg.Conv_K_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideA_ == 1))
+            {
+                return false;
+            }
+            if(!(arg.Conv_C_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideB_ == 1))
+            {
+                return false;
+            }
+        }
        // vector load A/B matrix from global memory
-        if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 &&
+        if(!(ABlockTransferSrcVectorDim == 1 && BBlockTransferSrcVectorDim == 1))
-             arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 &&
-             arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0))
        {
            return false;
        }
@@ -641,10 +1278,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        }
        // Gridwise GEMM size
-        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
+        return true;
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
-                                           arg.c_grid_desc_m_n_,
-                                           arg.block_2_ctile_map_);
    }
    bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -766,6 +1400,21 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        return str.str();
    }
+    void SetWorkSpacePointer(BaseArgument* p_arg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
+    {
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle::Argument structure!");
+    }
 };
 } // namespace device

--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
@@ -34,6 +34,94 @@ struct TransformConvBwdWeightToGemmV2
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_out_grid_desc(const index_t N,
+                       const index_t Wo,
+                       const index_t K,
+                       const std::array<index_t, NDimSpatial + 3>& output_strides)
+    {
+        const index_t BatchStride = output_strides[0];
+        const index_t WoStride    = output_strides[3];
+        const auto KStride        = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Wo, NumGroupsToMerge, K),
+                                            make_tuple(WoStride, BatchStride, KStride));
+    }
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_in_grid_desc(const index_t N,
+                      const index_t Wi,
+                      const index_t C,
+                      const std::array<index_t, NDimSpatial + 3>& input_strides)
+    {
+        const index_t BatchStride = input_strides[0];
+        const index_t NStride     = input_strides[1];
+        const index_t WiStride    = input_strides[3];
+        const auto CStride        = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Wi, NumGroupsToMerge, C),
+                                                make_tuple(WiStride, BatchStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Wi, NumGroupsToMerge, C),
+                make_tuple(NStride, WiStride, BatchStride, CStride));
+        }
+    }
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const index_t K,
+                       const index_t X,
+                       const index_t C,
+                       const std::array<index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride     = Number<1>{};
+        const auto KStride     = weights_strides[1];
+        const auto XStride     = weights_strides[3];
+        const auto BatchStride = weights_strides[0];
+        // Add NumGroupsToMerge for Batch+M dimension and, 1 as a placehorder
+        // for Batch+N dimension
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(NumGroupsToMerge, K, X, 1, C),
+            make_tuple(BatchStride, KStride, XStride, BatchStride, CStride));
+        // Padd 1 to NumGroupsToMerge
+        const auto padded_desc = transform_tensor_descriptor(
+            desc,
+            make_tuple(make_pass_through_transform(NumGroupsToMerge),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pad_transform(1, 0, NumGroupsToMerge - 1),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+        // We need only matrices from diagonal. Xor returns 0 for the same
+        // values. So if matrices is not on diagonal then it will be stored in padding.
+        // To avoid use of modulo after xor we assume that NumBatch to merge is power of 2.
+        static_assert(NumGroupsToMerge == 1 || NumGroupsToMerge == 2 || NumGroupsToMerge == 4 ||
+                      NumGroupsToMerge == 8 || NumGroupsToMerge == 16 || NumGroupsToMerge == 32 ||
+                      NumGroupsToMerge == 64);
+        const auto unmerged_padded_desc = transform_tensor_descriptor(
+            padded_desc,
+            make_tuple(make_xor_transform(make_tuple(NumGroupsToMerge, NumGroupsToMerge)),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}));
+        // Merge To M, N
+        return transform_tensor_descriptor(
+            unmerged_padded_desc,
+            make_tuple(make_merge_transform(make_tuple(NumGroupsToMerge, K)),
+                       make_merge_transform(make_tuple(X, NumGroupsToMerge, C))),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3, 4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
    constexpr static auto
    make_out_grid_desc(const index_t N,
@@ -221,6 +309,187 @@ struct TransformConvBwdWeightToGemmV2
            make_tuple(Sequence<0>{}, Sequence<1>{}));
    }
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& input_strides,
+        const std::array<index_t, NDimSpatial + 3>& weights_strides,
+        const std::array<index_t, NDimSpatial + 3>& output_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+        const index_t Wi = input_spatial_lengths[0];
+        const index_t Wo = output_spatial_lengths[0];
+        const index_t X = filter_spatial_lengths[0];
+        const index_t ConvStrideW = conv_filter_strides[0];
+        const index_t ConvDilationW = conv_filter_dilations[0];
+        const index_t InLeftPadW = input_left_pads[0];
+        const index_t InRightPadW = input_right_pads[0];
+        const index_t GemmKTotal = N * Wo;
+        const index_t GemmM      = K * NumGroupsToMerge;
+        const index_t GemmN      = C * X * NumGroupsToMerge;
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+        const auto out_grid_desc = make_out_grid_desc<NDim>(N, Wo, K, output_strides);
+        const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, X, C, weights_strides);
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            // B: input tensor
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmN / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_grid_desc);
+        }
+        else
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            // B: input tensor
+            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(NumGroupsToMerge),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+            const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(NumGroupsToMerge),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4>{}));
+            const auto in_gemmktotal_gemmn_grid_desc = transform_tensor_descriptor(
+                in_n_y_ho_x_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(X, NumGroupsToMerge, C)),
+                           make_merge_transform(make_tuple(N, Wo))),
+                make_tuple(Sequence<1, 3, 4>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<1>{}, Sequence<0>{}));
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+    } // function end
    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
        const index_t N,

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -44,32 +44,24 @@ template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances = std::tuple<
    // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,   32,   8,   16,   16,    1,    1,    S<4, 8,  1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              1,              4,      false,      S<4, 8,  1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 1>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,   32,   8,   32,   32,    1,    1,    S<4, 8,  1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              2,              2,      false,      S<4, 16, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 2>,
-        // instances for small conv.K and conv.C
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,   32,   8,   32,   32,    1,    2,    S<4, 8,  1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              4,              4,      false,      S<4, 16, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,   32,   8,   32,   32,    1,    4,    S<4, 4,  1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              8,              8,      false,      S<4, 16, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,   32,   8,   16,   16,    1,    1,    S<4, 8,  1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              1,              4,      false,       S<4, 8, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 1>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,   32,   8,   32,   32,    1,    1,    S<4, 16, 1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              2,              2,      false,       S<4, 8, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 2>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,   32,   8,   32,   32,    2,    1,    S<4, 16, 1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              4,              4,      false,       S<4, 8, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,   32,   8,   32,   32,    4,    1,    S<4, 16, 1>,        S<2, 0, 1>,     S<1, 0, 2>,            1,              8,              8,      false,       S<4, 4, 1>,    S<2, 0, 1>,     S<1, 0, 2>,             1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 8>
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
    // clang-format on
    >;
@@ -77,34 +69,32 @@ template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances = std::tuple<
    // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              1,              4,      false,  S<4, 8,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 1, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              2,              2,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, F16, F16>,
-        // instance for small conv.K
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        // for fp16 conv.K and conv.C must be divisible by 2
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, F16, F16>,
-        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              2,              2,      false,  S<4, 8,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 8,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 4,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               8, Scheduler, PipelineVersion, 8, F16, F16>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              2,              2,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 8,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 4,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               2, Scheduler, PipelineVersion, 8, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              2,              2,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 16, 1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              4,              4,      false,  S<4, 8,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, F16, F16>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,        S<2, 0, 1>,  S<1, 0, 2>,                 1,              8,              8,      false,  S<4, 4,  1>,       S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               2, Scheduler, PipelineVersion, 8, F16, F16>
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
    // clang-format on
    >;
@@ -112,32 +102,31 @@ template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std::tuple<
    // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|               ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                     Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|             Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                           |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
        // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,    32,  8,   32,   32,    2,    2,     S<4, 8,  2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              1,              4,      false,     S<4, 8,  2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              1,              4,     false,           1,           1,   S<1, 16, 1, 4>,               2, Scheduler, PipelineVersion, 1>,
        // instance for small conv.K                                                                                                                                                                                                                                                                                                                                                                                                                                                          
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>, 
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,  8,   32,   32,    1,    2,     S<4, 4,  4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              1,              2,      false,     S<4, 8,  2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 16, 1, 4>,               2, Scheduler, PipelineVersion, 4>,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,    32,  8,   32,   32,    4,    2,     S<4, 32, 2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              4,      false,     S<4, 16, 4>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              2,     false,           1,           1,   S<1, 32, 1, 8>,               4, Scheduler, PipelineVersion, 4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,    32,  8,   32,   32,    2,    4,     S<4, 16, 4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              2,      false,     S<4, 32, 2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 32, 1, 8>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,    32,  8,   32,   32,    4,    2,     S<4, 16, 2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              4,      false,     S<4, 16, 2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,    32,  8,   32,   32,    2,    2,     S<4, 16, 4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              2,      false,     S<4, 16, 4>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              2,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,    32,  8,   32,   32,    2,    2,     S<4, 16, 2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              4,      false,     S<4, 8,  4>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              2,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,    32,  8,   32,   32,    2,    2,     S<4, 8,  4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              2,      false,     S<4, 16, 2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,    32,  8,   32,   32,    2,    2,     S<4, 8,  2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              4,      false,     S<4, 8,  2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 16, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,    32,  8,   32,   32,    2,    1,     S<4, 16, 4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              2,      false,     S<4, 8,  8>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              1,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,    32,  8,   32,   32,    1,    2,     S<4, 8,  8>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              1,      false,     S<4, 16, 4>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              2,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,    32,  8,   32,   32,    1,    2,     S<4, 4,  8>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              1,      false,     S<4, 16, 2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 32, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,    32,  8,   32,   32,    2,    1,     S<4, 8,  2>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              4,      false,     S<4, 4,  4>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              2,     false,           1,           1,   S<1, 16, 1, 4>,               2, Scheduler, PipelineVersion, 4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,  8,   32,   32,    1,    2,     S<4, 4,  4>,       S<2, 0, 1>,    S<1, 0, 2>,              1,              8,              2,      false,     S<4, 8,  2>,   S<2, 0, 1>,       S<1, 0, 2>,             1,              8,              4,     false,           1,           1,   S<1, 16, 1, 4>,               2, Scheduler, PipelineVersion, 4>
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
    // clang-format on
    >;
@@ -145,34 +134,22 @@ template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
    // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
-        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,    32,  8,   16,   16,    1,    1,    S<4, 8,  1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              1,              4,     false,    S<4, 8, 1>,      S<2, 0, 1>,    S<1, 0, 2>,               1,              1,              4,    false,           1,           1,   S<1, 8, 1, 8>,                 2, Scheduler, PipelineVersion, 1>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,  8,   32,   32,    1,    1,    S<4, 8,  1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              2,              2,     false,    S<4, 16, 1>,     S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,    false,           1,           1,   S<1, 8, 1, 8>,                 4, Scheduler, PipelineVersion, 2>,
-        // instance for small conv.K
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,  8,   32,   32,    1,    2,    S<4, 8,  1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              4,              4,     false,    S<4, 16, 1>,     S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,    false,           1,           1,   S<1, 8, 1, 8>,                 2, Scheduler, PipelineVersion, 4>,
-        // for bf16 conv.K and conv.C must be divisible by 2
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,    32,  8,   32,   32,    1,    4,    S<4, 4,  1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              8,              8,     false,    S<4, 16, 1>,     S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,    false,           1,           1,   S<1, 8, 1, 8>,                 2, Scheduler, PipelineVersion, 8>,
-        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,  8,   32,   32,    1,    1,    S<4, 16, 1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              2,              2,     false,    S<4, 8, 1>,      S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,    false,           1,           1,   S<1, 8, 1, 8>,                 4, Scheduler, PipelineVersion, 2>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,    32,  8,   32,   32,    2,    1,    S<4, 16, 1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              4,              4,     false,    S<4, 8, 1>,      S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,    false,           1,           1,   S<1, 8, 1, 8>,                 2, Scheduler, PipelineVersion, 4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,    32,  8,   32,   32,    4,    1,    S<4, 16, 1>,       S<2, 0, 1>,     S<1, 0, 2>,               1,              8,              8,     false,    S<4, 4, 1>,      S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,    false,           1,           1,   S<1, 8, 1, 8>,                 2, Scheduler, PipelineVersion, 8>
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
    // clang-format on
    >;
@@ -180,35 +157,35 @@ template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_instances = std::tuple<
 // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|
+        //################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|
+        //################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        |
+        //################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |
+        //################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,    32,   8,  16,   16,    1,    1,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              1,              4,      false,    S<4, 8,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 1, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
-        // instance for small conv.K
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,   8,  32,   32,    1,    1,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 2, BF8, F8>,
-        // for fp16 conv.K and conv.C must be divisible by 2
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,   8,  32,   32,    1,    2,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,    32,   8,  32,   32,    1,    4,     S<4, 4,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,   8,  32,   32,    1,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,      false,    S<4, 8,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, BF8, F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,    32,   8,  32,   32,    2,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 8,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,    32,   8,  32,   32,    4,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 4,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               2, Scheduler, PipelineVersion, 8, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,   8,  32,   32,    1,    1,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,   8,  32,   32,    1,    2,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,    32,   8,  32,   32,    1,    4,     S<4, 4,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,    32,   8,  32,   32,    2,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 8,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,    32,   8,  32,   32,    4,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 4,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               2, Scheduler, PipelineVersion, 8, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,    32,   8,  32,   32,    1,    1,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              2,              2,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,               4, Scheduler, PipelineVersion, 2, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,    32,   8,  32,   32,    1,    2,     S<4, 8,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,    32,   8,  32,   32,    1,    4,     S<4, 4,  1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 16, 1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 4, 1, 8>,               2, Scheduler, PipelineVersion, 8, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,    32,   8,  32,   32,    2,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              4,              4,      false,    S<4, 8,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,               2, Scheduler, PipelineVersion, 4, BF8, F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,    32,   8,  32,   32,    4,    1,     S<4, 16, 1>,       S<2, 0, 1>,    S<1, 0, 2>,               1,              8,              8,      false,    S<4, 4,  1>,     S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 4>,               2, Scheduler, PipelineVersion, 8, BF8, F8>
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
 #endif
    // clang-format on
    >;

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -276,7 +276,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
                             is_same_v<ComputeTypeB, float>)
                {
-                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev5_instances(
+                        op_ptrs);
                }
 #endif
 #ifdef CK_ENABLE_FP16
@@ -284,7 +291,14 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                             is_same_v<ComputeTypeB, half_t>)
                {
-                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instances(
+                        op_ptrs);
                }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -293,7 +307,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -309,7 +329,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
                             is_same_v<ComputeTypeB, float>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -318,7 +344,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                             is_same_v<ComputeTypeB, half_t>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev2_instances(
                        op_ptrs);
                }
 #endif
@@ -328,7 +360,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -341,7 +379,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
                             is_same_v<ComputeTypeB, float>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -350,7 +394,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                             is_same_v<ComputeTypeB, half_t>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instances(
                        op_ptrs);
                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
                        op_ptrs);
@@ -366,7 +416,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_pad0_pipev5_instances(
                        op_ptrs);
                }
                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
@@ -375,7 +431,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instances(
                        op_ptrs);
                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
                        op_ptrs);
@@ -429,7 +491,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
                             is_same_v<ComputeTypeB, float>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -438,7 +506,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                             is_same_v<ComputeTypeB, half_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -448,7 +522,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif
@@ -461,7 +541,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
                             is_same_v<ComputeTypeB, float>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
                        op_ptrs);
                }
 #endif
@@ -470,7 +556,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
                             is_same_v<ComputeTypeB, half_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instances(
                        op_ptrs);
                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
                        op_ptrs);
@@ -486,7 +578,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_pad0_pipev5_instances(
                        op_ptrs);
                }
                if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
@@ -495,7 +593,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
                             is_same_v<ComputeTypeB, ck::bhalf_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instances(
                        op_ptrs);
                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances(
                        op_ptrs);
@@ -510,7 +614,13 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, bf8_t> &&
                             is_same_v<ComputeTypeB, f8_t>)
                {
-                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_pad0_pipev5_instances(
                        op_ptrs);
                }
 #endif

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -10,7 +10,40 @@ namespace instance {
 // conv1d backward weight
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -23,7 +56,40 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -36,7 +102,40 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -50,7 +149,40 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(
 #endif
 // conv2d backward weight
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           GNHWC,
                                                           GKYXC,
@@ -63,7 +195,40 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           GNHWC,
                                                           GKYXC,
@@ -76,7 +241,40 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           GNHWC,
                                                           GKYXC,
@@ -89,7 +287,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           NHWGC,
                                                           GKYXC,
@@ -100,8 +298,73 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
-void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           NHWGC,
                                                           GKYXC,
@@ -186,7 +449,76 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_p
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           NHWGC,
                                                           GKYXC,
@@ -197,72 +529,175 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
-                                                           NHWGC,
+                                                           NGCHW,
                                                           GKYXC,
-                                                           NHWGK,
+                                                           NGKHW,
                                                           F16,
                                                           F16,
                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+#endif
+#ifdef CK_ENABLE_FP32
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+#endif
+// conv3d backward weight
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           GNDHWC,
+                                                           GKZYXC,
+                                                           GNDHWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_pad0_pipev5_instances(
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNDHWC,
-                                                           NHWGC,
+                                                           GKZYXC,
-                                                           GKYXC,
+                                                           GNDHWK,
-                                                           NHWGK,
+                                                           BF16,
-                                                           F16,
+                                                           F32,
-                                                           F16,
+                                                           BF16,
-                                                           F16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+#endif
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instances(
+#ifdef CK_ENABLE_FP16
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_default_pipev2_instances(
-                                                           NHWGC,
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-                                                           GKYXC,
+                                                           GNDHWC,
-                                                           NHWGK,
+                                                           GKZYXC,
+                                                           GNDHWK,
                                                           F16,
                                                           F16,
                                                           F16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_default_pipev5_instances(
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNDHWC,
-                                                           NGCHW,
+                                                           GKZYXC,
-                                                           GKYXC,
+                                                           GNDHWK,
-                                                           NGKHW,
                                                           F16,
                                                           F16,
                                                           F16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_pad0_pipev2_instances(
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNDHWC,
-                                                           NGCHW,
+                                                           GKZYXC,
-                                                           GKYXC,
+                                                           GNDHWK,
-                                                           NGKHW,
                                                           F16,
                                                           F16,
                                                           F16,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_pad0_pipev5_instances(
-void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNDHWC,
-                                                           NGCHW,
+                                                           GKZYXC,
-                                                           GKYXC,
+                                                           GNDHWK,
-                                                           NGKHW,
                                                           F16,
                                                           F16,
                                                           F16,
@@ -271,47 +706,40 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pi
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_default_pipev2_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
-                                                           NHWGC,
+                                                           GNDHWC,
-                                                           GKYXC,
+                                                           GKZYXC,
-                                                           NHWGK,
+                                                           GNDHWK,
                                                           F32,
                                                           F32,
                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
-#endif
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_default_pipev5_instances(
-// conv3d backward weight
-#ifdef CK_ENABLE_BF16
-void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
                                                           GKZYXC,
                                                           GNDHWK,
-                                                           BF16,
                                                           F32,
-                                                           BF16,
+                                                           F32,
+                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
-#endif
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_pad0_pipev2_instances(
-#ifdef CK_ENABLE_FP16
-void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
                                                           GKZYXC,
                                                           GNDHWK,
-                                                           F16,
+                                                           F32,
-                                                           F16,
+                                                           F32,
-                                                           F16,
+                                                           F32,
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
-#endif
+void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_pad0_pipev5_instances(
-#ifdef CK_ENABLE_FP32
-void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           GNDHWC,
                                                           GKZYXC,
@@ -324,7 +752,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,
@@ -335,8 +763,73 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instances(
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,
@@ -421,7 +914,40 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf1
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP16
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,
@@ -506,7 +1032,40 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,
@@ -519,7 +1078,46 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
                                                           PassThrough>>>& instances);
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
-void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           BF8,
+                                                           F8>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           BF8,
+                                                           F8>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           BF8,
+                                                           F8>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_pad0_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                           NDHWGC,
                                                           GKZYXC,

--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt
 # ONLY XDL_AND_DL_KERNELS
 set(GROUPED_CONV1D_BWD_WEIGHT
-    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev2_instance.cpp
-    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instance.cpp
-    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp)
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev5_instance.cpp)
 if(DL_KERNELS)
    list(APPEND GROUPED_CONV1D_BWD_WEIGHT
@@ -15,3 +24,4 @@ if(DL_KERNELS)
 endif()
 add_instance_library(device_grouped_conv1d_bwd_weight_instance ${GROUPED_CONV1D_BWD_WEIGHT})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -9,7 +9,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -21,7 +21,6 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
-    // 1. Default
    add_device_operation_instances(
        instances,
        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
@@ -29,16 +28,9 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_insta
            GNWC,
            GKXC,
            GNWK,
-            ConvBwdWeightDefault>{});
+            ConvBwdWeightDefault,
-    // 2. Filter1x1Stride1Pad0
+            BlockGemmPipelineScheduler::Intrawave,
-    add_device_operation_instances(
+            BlockGemmPipelineVersion::v2>{});
-        instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
-            1,
-            GNWC,
-            GKXC,
-            GNWK,
-            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            1,
+            GNWC,
+            GKXC,
+            GNWK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            1,
+            GNWC,
+            GKXC,
+            GNWK,
+            ConvBwdWeightFilter1x1Stride1Pad0,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v2>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           BF16,
+                                                           F32,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+            1,
+            GNWC,
+            GKXC,
+            GNWK,
+            ConvBwdWeightFilter1x1Stride1Pad0,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v5>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       1,
+                                       GNWC,
+                                       GKXC,
+                                       GNWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       1,
+                                       GNWC,
+                                       GKXC,
+                                       GNWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -9,7 +9,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -21,22 +21,15 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
-    // 1. Default
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<1,
-                                                                   GNWC,
-                                                                   GKXC,
-                                                                   GNWK,
-                                                                   ConvBwdWeightDefault>{});
-    // 2. Filter1x1Stride1Pad0
    add_device_operation_instances(instances,
                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
                                       1,
                                       GNWC,
                                       GKXC,
                                       GNWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
 }
 } // namespace instance

--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+                                       1,
+                                       GNWC,
+                                       GKXC,
+                                       GNWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       1,
+                                       GNWC,
+                                       GKXC,
+                                       GNWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
+                                                           GNWC,
+                                                           GKXC,
+                                                           GNWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
+                                       1,
+                                       GNWC,
+                                       GKXC,
+                                       GNWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/xdl/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -9,7 +9,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(
+void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_pad0_pipev2_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<1,
                                                           GNWC,
                                                           GKXC,
@@ -21,22 +21,15 @@ void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(
                                                           PassThrough,
                                                           PassThrough>>>& instances)
 {
-    // 1. Default
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<1,
-                                                                   GNWC,
-                                                                   GKXC,
-                                                                   GNWK,
-                                                                   ConvBwdWeightDefault>{});
-    // 2. Filter1x1Stride1Pad0
    add_device_operation_instances(instances,
                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
                                       1,
                                       GNWC,
                                       GKXC,
                                       GNWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
 }
 } // namespace instance