gaoqiong / composable_kernel · Commit 7e493730

Commit 7e493730, authored Oct 13, 2022 by Adam Osewski

    Merge branch 'develop' into wavelet_model

Parents: b89a88b5, 40942b90

Changes: 114 files in this commit; showing 20 changed files with 2849 additions and 263 deletions (+2849 / -263).
include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp (+2 -6)
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp (+27 -25)
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp (+39 -7)
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp (+39 -7)
include/ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp (+15 -13)
include/ck/tensor_operation/gpu/device/device_elementwise.hpp (+25 -9)
include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute_xdl.hpp (+0 -4)
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp (+107 -14)
include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp (+13 -14)
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp (+67 -0)
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp (+5 -3)
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp (+24 -37)
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp (+69 -0)
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp (+959 -0)
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp (+12 -17)
include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp (+88 -107)
include/ck/tensor_operation/gpu/device/device_permute.hpp (+37 -0)
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp (+1015 -0)
include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp (+282 -0)
include/ck/tensor_operation/gpu/device/tensor_layout.hpp (+24 -0)
include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm_xdl_cshuffle.hpp

@@ -503,13 +503,9 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
-                                            arg.b_grid_desc_bk0_n_bk1_,
-                                            arg.b1_grid_desc_bk0_n_bk1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            if(!DeviceOp::IsSupportedArgument(arg))
             {
-                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+                throw std::runtime_error("wrong! unsupported argument");
             }

             const index_t grid_size =
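The hunk above consolidates validation: Run() no longer re-checks the gridwise descriptors itself and instead defers to the device op's IsSupportedArgument(). A minimal stand-alone sketch of that shape, with all names invented for illustration (not CK types):

#include <stdexcept>

struct ExampleDeviceOp
{
    struct Argument
    {
        bool descriptors_consistent; // stands in for the real descriptor checks
    };

    // single place where every supportability rule lives
    static bool IsSupportedArgument(const Argument& arg) { return arg.descriptors_consistent; }

    // the launcher only consults the predicate and reports one uniform error
    float Run(const Argument& arg)
    {
        if(!IsSupportedArgument(arg))
        {
            throw std::runtime_error("wrong! unsupported argument");
        }
        return 0.f; // kernel launch and timing elided
    }
};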
include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp

@@ -333,10 +333,6 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -370,12 +366,19 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;

+    // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;

-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

     // Argument
     struct Argument : public BaseArgument
@@ -478,10 +481,9 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;

         // for calculating batch offset
         ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
@@ -520,21 +522,21 @@ struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD<ALayout,
             auto launch_kernel = [&](auto has_main_k_block_loop) {
                 constexpr bool has_main_loop = has_main_k_block_loop.value;

-                const auto kernel = kernel_batched_gemm_xdl<
-                    GridwiseGemm,
+                const auto kernel =
+                    kernel_batched_gemm_xdl<GridwiseGemm,
                     ADataType, // TODO: distiguish A/B datatype
                     typename GridwiseGemm::DsGridPointer,
                     EDataType,
                     AElementwiseOperation,
                     BElementwiseOperation,
                     CDEElementwiseOperation,
                     DeviceOp::AGridDesc_AK0_M_AK1,
                     DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                     ComputePtrOffsetOfStridedBatch,
                     Block2ETileMap,
                     has_main_loop>;

                 return launch_and_time_kernel(stream_config,
                                               kernel,
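The alias hoisting above (repeated in several files below) follows one C++ idiom: the device op derives descriptor and tile-map types from the gridwise factory functions via remove_cvref_t<decltype(...)>, so kernel template arguments can be spelled DeviceOp::Xxx instead of typename GridwiseGemm::Xxx. A small self-contained sketch of the idiom with placeholder types (not CK code):

#include <type_traits>

template <typename T>
using remove_cvref_t = std::remove_cv_t<std::remove_reference_t<T>>; // stand-in for ck::remove_cvref_t

struct ExampleGridwiseGemm
{
    struct EGridDesc_M_N { int m, n; };
    struct TileMap { int tiles; };

    // factory whose return type the device op wants to re-export
    static TileMap MakeDefaultBlock2ETileMap(EGridDesc_M_N desc) { return {desc.m * desc.n}; }
};

struct ExampleDeviceOp
{
    using EGridDesc_M_N = ExampleGridwiseGemm::EGridDesc_M_N;

    // the alias is whatever the factory returns, with cv/ref qualifiers stripped
    using Block2ETileMap = remove_cvref_t<decltype(
        ExampleGridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
};

static_assert(std::is_same_v<ExampleDeviceOp::Block2ETileMap, ExampleGridwiseGemm::TileMap>, "");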
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp

@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
           typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
           typename Block2CTileMap,
           typename ComputeBasePtrOfStridedBatch,
+          typename C0MatrixMask,
           bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -57,7 +58,8 @@ __global__ void
             c_grid_desc_mblock_mperblock_nblock_nperblock,
         const Block2CTileMap block_2_ctile_map,
         const index_t batch_count,
-        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
+        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
+        const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
@@ -88,7 +90,8 @@ __global__ void
         b_grid_desc_bk0_n_bk1,
         b1_grid_desc_bk0_n_bk1,
         c_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_ctile_map);
+        block_2_ctile_map,
+        c0_matrix_mask);
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -106,6 +109,7 @@ __global__ void
     ignore = block_2_ctile_map;
     ignore = batch_count;
     ignore = compute_base_ptr_of_batch;
+    ignore = c0_matrix_mask;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
@@ -168,6 +172,7 @@ template <typename ALayout,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          bool MaskOutUpperTriangle,
           LoopScheduler LoopSched = LoopScheduler::Default>
 struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
     : public DeviceBatchedGemmSoftmaxGemmPermute<ALayout,
@@ -194,9 +199,6 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
             MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};

-    // FIXME: pad K
-    static_assert(!matrix_padder.PadK, "KPadding is currently not supported");
-
     static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
     {
         const auto a_grid_desc_mraw_kraw = [&]() {
@@ -398,6 +400,29 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
     using CGridDesc_M_N   = decltype(MakeCGridDescriptor_M_N({}, {}));
     using CGridDesc_G_M_N = decltype(MakeCGridDescriptor_G_M_N({}, {}));

+    // to track the points which need to be set to -inf on C0
+    // Note: no need to reset M padding value, because they will not be stored out.
+    struct C0MatrixMask
+    {
+        C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}
+
+        __host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }
+
+        __host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
+        {
+            return n >= NRaw_;
+        }
+
+        __host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
+        {
+            return IsUpperTriangle(m, n) || IsNOutOfBound(n);
+        }
+
+        private:
+        // index_t MRaw_;
+        index_t NRaw_;
+    };
+
     struct ComputeBasePtrOfStridedBatch
     {
         ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
@@ -498,7 +523,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
-        matrix_padder.PadN>;
+        matrix_padder.PadN,
+        MaskOutUpperTriangle>;

     // Argument
     // FIXME: constness
@@ -548,6 +574,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
               batch_count_(Batch),
               compute_base_ptr_of_batch_{
                   BatchStrideA, BatchStrideB, BatchStrideB1, c_grid_desc_g_m_n_},
+              c0_matrix_mask_{NRaw},
               raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw},
               c_extent_lowest_{c_gs_ms_gemm1ns_lengths.back()},
               c_stride_lowest_{c_gs_ms_gemm1ns_strides.back()}
@@ -585,6 +612,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
         index_t batch_count_;
         ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;

+        // check C0 masking and padding
+        C0MatrixMask c0_matrix_mask_;
+
         // For robust IsSupportedArgument() check
         std::vector<index_t> raw_lengths_m_n_k_o_;
         index_t c_extent_lowest_;
@@ -632,6 +662,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                 typename GridwiseGemm::DefaultBlock2CTileMap,
                 ComputeBasePtrOfStridedBatch,
+                C0MatrixMask,
                 has_main_k_block_loop_>;

             return launch_and_time_kernel(stream_config,
@@ -654,7 +685,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                 arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
                 arg.block_2_ctile_map_,
                 arg.batch_count_,
-                arg.compute_base_ptr_of_batch_);
+                arg.compute_base_ptr_of_batch_,
+                arg.c0_matrix_mask_);
         };

         // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need
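The C0MatrixMask introduced above encodes two rules for the GEMM + Softmax + GEMM pipeline: an element (m, n) of the first GEMM's output is masked (set to -inf before the softmax) if it lies strictly above the diagonal or if n falls into the padded region beyond NRaw. A host-side sketch of the same predicate, for illustration only:

#include <cassert>

struct MaskSketch
{
    int NRaw_; // unpadded N extent of C0

    bool IsUpperTriangle(int m, int n) const { return n > m; }
    bool IsNOutOfBound(int n) const { return n >= NRaw_; }
    bool IsMaskedElement(int m, int n) const { return IsUpperTriangle(m, n) || IsNOutOfBound(n); }
};

int main()
{
    const MaskSketch mask{4};
    assert(!mask.IsMaskedElement(2, 1)); // on/below the diagonal and in bounds: kept
    assert(mask.IsMaskedElement(1, 2));  // above the diagonal: masked (causal attention)
    assert(mask.IsMaskedElement(3, 5));  // n >= NRaw (padding): also masked
    return 0;
}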
include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp

@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
           typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
           typename Block2CTileMap,
           typename ComputeBasePtrOfStridedBatch,
+          typename C0MatrixMask,
           bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -57,7 +58,8 @@ __global__ void
             c_grid_desc_mblock_mperblock_nblock_nperblock,
         const Block2CTileMap block_2_ctile_map,
         const index_t batch_count,
-        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch)
+        const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
+        const C0MatrixMask c0_matrix_mask)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
@@ -88,7 +90,8 @@ __global__ void
         b_grid_desc_bk0_n_bk1,
         b1_grid_desc_bk0_n_bk1,
         c_grid_desc_mblock_mperblock_nblock_nperblock,
-        block_2_ctile_map);
+        block_2_ctile_map,
+        c0_matrix_mask);
 #else
     ignore = p_a_grid;
     ignore = p_b_grid;
@@ -106,6 +109,7 @@ __global__ void
     ignore = block_2_ctile_map;
     ignore = batch_count;
     ignore = compute_base_ptr_of_batch;
+    ignore = c0_matrix_mask;
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
@@ -177,6 +181,7 @@ template <typename ALayout,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          bool MaskOutUpperTriangle,
           LoopScheduler LoopSched = LoopScheduler::Default>
 struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
     : public DeviceBatchedGemmSoftmaxGemm<ALayout,
@@ -203,9 +208,6 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
             MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};

-    // FIXME: pad K
-    static_assert(!matrix_padder.PadK, "KPadding is currently not supported");
-
     static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
     {
         const auto a_grid_desc_mraw_kraw = [&]() {
@@ -313,6 +315,29 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
     }

+    // to track the points which need to be set to -inf on C0
+    // Note: no need to reset M padding value, because they will not be stored out.
+    struct C0MatrixMask
+    {
+        C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}
+
+        __host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }
+
+        __host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
+        {
+            return n >= NRaw_;
+        }
+
+        __host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
+        {
+            return IsUpperTriangle(m, n) || IsNOutOfBound(n);
+        }
+
+        private:
+        // index_t MRaw_;
+        index_t NRaw_;
+    };
+
     struct ComputeBasePtrOfStridedBatch
     {
         ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
@@ -418,7 +443,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
         CShuffleBlockTransferScalarPerVector_NPerBlock,
         LoopSched,
-        matrix_padder.PadN>;
+        matrix_padder.PadN,
+        MaskOutUpperTriangle>;

     // Argument
     struct Argument : public BaseArgument
@@ -463,6 +489,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
               c_element_op_{c_element_op},
               batch_count_(Batch),
               compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC},
+              c0_matrix_mask_{NRaw},
               raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw}
         {
             if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
@@ -497,6 +524,9 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         index_t batch_count_;
         ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;

+        // check C0 masking and padding
+        C0MatrixMask c0_matrix_mask_;
+
         // For robust IsSupportedArgument() check
         std::vector<index_t> raw_lengths_m_n_k_o_;
     };
@@ -542,6 +572,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                 typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
                 typename GridwiseGemm::DefaultBlock2CTileMap,
                 ComputeBasePtrOfStridedBatch,
+                C0MatrixMask,
                 has_main_k_block_loop_>;

             return launch_and_time_kernel(stream_config,
@@ -564,7 +595,8 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
                 arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
                 arg.block_2_ctile_map_,
                 arg.batch_count_,
-                arg.compute_base_ptr_of_batch_);
+                arg.compute_base_ptr_of_batch_,
+                arg.c0_matrix_mask_);
         };

         // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need
include/ck/tensor_operation/gpu/device/device_contraction_multiple_d_xdl_cshuffle.hpp

@@ -320,10 +320,6 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -357,12 +353,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;

+    // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;

-    using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

     // Argument
     struct Argument : public BaseArgument
@@ -475,10 +478,9 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;

         // block-to-e-tile map
         Block2ETileMap block_2_etile_map_;
@@ -535,9 +537,9 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                     CDEElementwiseOperation,
                     DeviceOp::AGridDesc_AK0_M_AK1,
                     DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2ETileMap,
+                    DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::Block2ETileMap,
                     has_main_loop>;

                 return launch_and_time_kernel(stream_config,
include/ck/tensor_operation/gpu/device/device_elementwise.hpp

@@ -222,14 +222,9 @@ struct DeviceElementwise
         }
     };

-    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    static bool IsSupportedArgument(const Argument& arg)
     {
-        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
-
-        if(pArg == nullptr)
-            return false;
-
-        if(pArg->lengths_.back() % MPerThread != 0)
+        if(arg.lengths_.back() % MPerThread != 0)
             return false;

         auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
@@ -247,19 +242,40 @@ struct DeviceElementwise
         bool valid = true;
         static_for<0, NumInput, 1>{}([&](auto I) {
             if(!IsScalarPerVectorValid(
-                   pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
+                   arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I)))
                 valid = false;
         });

         static_for<0, NumOutput, 1>{}([&](auto I) {
             if(!IsScalarPerVectorValid(
-                   pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
+                   arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I)))
                 valid = false;
         });

         return valid;
     };

+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const std::array<index_t, NumDim> lengths,
+                             const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
+                             const std::array<std::array<index_t, NumDim>, NumOutput> outStridesArray,
+                             const std::array<const void*, NumInput> in_dev_buffers,
+                             const std::array<void*, NumOutput> out_dev_buffers,
+                             ElementwiseOperation elementwise_op)
+    {
+        return Argument{lengths,
+                        inStridesArray,
+                        outStridesArray,
+                        in_dev_buffers,
+                        out_dev_buffers,
+                        elementwise_op};
+    }
+
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const std::array<index_t, NumDim> lengths,
                         const std::array<std::array<index_t, NumDim>, NumInput> inStridesArray,
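The device_elementwise.hpp change above splits argument checking into a static, typed IsSupportedArgument(const Argument&) plus a thin virtual override that merely casts and forwards, and adds a static MakeArgument() that builds the typed Argument by value. A simplified sketch of that layering with placeholder types (not the CK classes):

struct BaseArgument
{
    virtual ~BaseArgument() = default;
};

struct ExampleDeviceOp
{
    static constexpr int MPerThread = 8;

    struct Argument : BaseArgument
    {
        explicit Argument(int len) : innermost_length(len) {}
        int innermost_length;
    };

    // typed check, callable without going through the base-class interface
    static bool IsSupportedArgument(const Argument& arg)
    {
        return arg.innermost_length % MPerThread == 0;
    }

    // type-erased entry point; like the committed code it assumes p_arg really is an Argument
    bool IsSupportedArgument(const BaseArgument* p_arg) const
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

    // value-returning factory next to the existing pointer-returning MakeArgumentPointer
    static Argument MakeArgument(int innermost_length) { return Argument{innermost_length}; }
};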
include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute_xdl.hpp

@@ -237,10 +237,6 @@ struct DeviceGemmBiasEPermute_Xdl : public DeviceGemmBiasCPermute<AElementwiseOp
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp

@@ -234,6 +234,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                           Number<NumDTensor>{});
     }

+    // desc for problem definition
     using AGridDesc_M_K  = decltype(MakeAGridDescriptor_M_K(1, 1, 1));
     using BGridDesc_N_K  = decltype(MakeBGridDescriptor_N_K(1, 1, 1));
     using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}, {}))>;
@@ -250,10 +251,6 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -287,10 +284,19 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;

+    // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;
+
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

     // Argument
     struct Argument : public BaseArgument
@@ -326,7 +332,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
               block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
               a_element_op_{a_element_op},
               b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op}
+              cde_element_op_{cde_element_op},
+              MRaw_{MRaw},
+              NRaw_{NRaw},
+              KRaw_{KRaw}
         {
             // populate pointer, desc for Ds
             static_for<0, NumDTensor, 1>{}([&](auto i) {
@@ -383,18 +392,22 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;

         // block-to-e-tile map
-        typename GridwiseGemm::DefaultBlock2ETileMap block_2_etile_map_;
+        Block2ETileMap block_2_etile_map_;

         // element-wise op
         AElementwiseOperation a_element_op_;
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;
+
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
     };

     // Invoker
@@ -429,9 +442,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                     CDEElementwiseOperation,
                     DeviceOp::AGridDesc_AK0_M_AK1,
                     DeviceOp::BGridDesc_BK0_N_BK1,
-                    typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                    typename GridwiseGemm::DefaultBlock2ETileMap,
+                    DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::Block2ETileMap,
                     has_main_loop>;

                 return launch_and_time_kernel(stream_config,
@@ -480,6 +493,86 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
             return false;
         }

+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+            // check vector load of A
+            if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector laod of B
+            if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+            {
+                if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+            {
+                // FIXME: not rigorous
+                if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+
+            // check vector load of Ds
+            // only support RowMajor for now
+            bool all_valid = true;
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+
+            if(!all_valid)
+            {
+                return false;
+            }
+
+            // check vector store of E
+            // only support RowMajor for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                            arg.b_grid_desc_n_k_,
                                            arg.ds_grid_desc_m_n_,
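The new checks above encode a divisibility rule: vectorized global loads and stores run along a tensor's contiguous dimension, so that raw extent (K for a row-major A, M for a column-major A, N for the row-major E store) must be a multiple of the configured scalars-per-vector. A tiny host-side sketch of the rule, with illustrative names only:

#include <cassert>

bool IsVectorAccessValid(bool row_major, int MRaw, int KRaw, int scalar_per_vector)
{
    // row-major A is contiguous along K; column-major A is contiguous along M
    const int contiguous_extent = row_major ? KRaw : MRaw;
    return contiguous_extent % scalar_per_vector == 0;
}

int main()
{
    assert(IsVectorAccessValid(true, 128, 64, 8));  // K = 64 divisible by 8: supported
    assert(!IsVectorAccessValid(true, 128, 60, 8)); // K = 60 not divisible: argument rejected
    assert(IsVectorAccessValid(false, 128, 60, 8)); // column-major layout checks M instead
    return 0;
}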
include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp

@@ -365,10 +365,6 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         BElementwiseOperation,
         CDEElementwiseOperation,
         InMemoryDataOperationEnum::Set,
-        AGridDesc_M_K,
-        BGridDesc_N_K,
-        DsGridDesc_M_N,
-        EGridDesc_M_N,
         NumGemmKPrefetchStage,
         BlockSize,
         MPerBlock,
@@ -402,17 +398,21 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         CDEBlockTransferScalarPerVector_NPerBlock,
         LoopSched>;

+    // desc for blockwise copy
     using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
     using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
         GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;

     struct GroupedContractionBlock2ETileMap
     {
-        static_assert(
-            std::is_same<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{})),
-                         typename GridwiseGemm::DefaultBlock2ETileMap>::value,
-            "Wrong! Should be the same type name");
+        // block-to-e-tile map
+        using Block2ETileMap =
+            remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

         GroupedContractionBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n,
                                          ck::index_t BlockStart)
@@ -441,7 +441,7 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
             return default_block_2_etile_map_.CheckValidity(e_grid_desc_m_n);
         }

-        typename GridwiseGemm::DefaultBlock2ETileMap default_block_2_etile_map_;
+        Block2ETileMap default_block_2_etile_map_;
         ck::index_t block_start_;
     };
@@ -456,10 +456,9 @@ struct DeviceGroupedContractionMultipleD_Xdl_CShuffle
         // tensor descriptors for block/thread-wise copy
         AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
         BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-        typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
             e_grid_desc_mblock_mperblock_nblock_nperblock_;

         // lock-to-e-tile map
         GroupedContractionBlock2ETileMap block_2_etile_map_;
include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

// Conv backward data multiple D:
//   input : output image A[G, N, K, Ho, Wo]
//   input : weight B[G, K, C, Y, X],
//   input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
//   output : input image E[G, N, C, Hi, Wi],
//   C = a_op(A) * b_op(B)
//   E = cde_op(C, D0, D1, ...)
template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename DsLayout,
          typename ELayout,
          typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation>
struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator
{
    static constexpr index_t NumDTensor = DsDataType::Size();

    static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");

    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const void* p_a,                                                 // output image
        const void* p_b,                                                 // weight
        const std::array<const void*, NumDTensor>& p_ds,                 // bias
        void* p_e,                                                       // input image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_k_wos_lengths, // bias
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_k_wos_strides,                                        // bias
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
        const std::array<index_t, NDimSpatial>& conv_filter_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
        const std::array<index_t, NDimSpatial>& input_left_pads,
        const std::array<index_t, NDimSpatial>& input_right_pads,
        const AElementwiseOperation& a_element_op,
        const BElementwiseOperation& b_element_op,
        const CDEElementwiseOperation& cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
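The interface above only declares the two factories a client needs; a concrete implementation (for example the xdl_cshuffle_v1 file added elsewhere in this commit) fills them in. Below is a self-contained, heavily simplified sketch of how such a MakeArgumentPointer/MakeInvokerPointer pair is typically consumed; every type and value in it is a placeholder, not the CK API:

#include <array>
#include <iostream>
#include <memory>

struct BaseArgument { virtual ~BaseArgument() = default; };
struct BaseInvoker
{
    virtual ~BaseInvoker() = default;
    virtual float Run(const BaseArgument*) = 0; // returns elapsed time in ms
};

struct ExampleConvBwdDataOp
{
    struct Argument : BaseArgument
    {
        Argument(std::array<int, 5> out, std::array<int, 5> wei, std::array<int, 5> in)
            : out_lengths(out), wei_lengths(wei), in_lengths(in)
        {
        }
        std::array<int, 5> out_lengths; // A: [G, N, K, Ho, Wo]
        std::array<int, 5> wei_lengths; // B: [G, K, C, Y, X]
        std::array<int, 5> in_lengths;  // E: [G, N, C, Hi, Wi]
    };

    struct Invoker : BaseInvoker
    {
        float Run(const BaseArgument*) override { return 0.f; } // kernel launch elided
    };

    std::unique_ptr<BaseArgument> MakeArgumentPointer(std::array<int, 5> out_lengths,
                                                      std::array<int, 5> wei_lengths,
                                                      std::array<int, 5> in_lengths) const
    {
        return std::make_unique<Argument>(out_lengths, wei_lengths, in_lengths);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() const { return std::make_unique<Invoker>(); }
};

int main()
{
    ExampleConvBwdDataOp op;
    auto arg     = op.MakeArgumentPointer({1, 2, 8, 14, 14}, {1, 8, 4, 3, 3}, {1, 2, 4, 16, 16});
    auto invoker = op.MakeInvokerPointer();
    std::cout << "elapsed: " << invoker->Run(arg.get()) << " ms\n";
    return 0;
}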
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp

@@ -34,11 +34,13 @@ struct DeviceGroupedConvFwdMultipleD : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();

+    static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor");
+
     virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
-        const void* p_a,
-        const void* p_b,
+        const void* p_a, // input image
+        const void* p_b, // weight
         const std::array<const void*, NumDTensor>& p_ds,
-        void* p_e,
+        void* p_e, // output image
         const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
         const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
         const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp
View file @
7e493730
...
@@ -117,7 +117,7 @@ __global__ void
...
@@ -117,7 +117,7 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
#endif
kernel_
batch_gemm
_multiple_d_xdl_cshuffle
(
kernel_
grouped_conv_fwd
_multiple_d_xdl_cshuffle
(
const
ABDataType
*
__restrict__
p_a_grid
,
const
ABDataType
*
__restrict__
p_a_grid
,
const
ABDataType
*
__restrict__
p_b_grid
,
const
ABDataType
*
__restrict__
p_b_grid
,
DsPointer
p_ds_grid
,
DsPointer
p_ds_grid
,
...
@@ -136,8 +136,7 @@ __global__ void
...
@@ -136,8 +136,7 @@ __global__ void
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
// offset base pointer for each work-group
#if 1
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
const
index_t
g_idx
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
num_blocks_per_batch
);
...
@@ -174,24 +173,6 @@ __global__ void
...
@@ -174,24 +173,6 @@ __global__ void
ds_grid_desc_mblock_mperblock_nblock_nperblock
,
ds_grid_desc_mblock_mperblock_nblock_nperblock
,
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
block_2_ctile_map
);
block_2_ctile_map
);
#else
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
,
p_b_grid
,
p_ds_grid
,
p_e_grid
,
p_shared
,
a_element_op
,
b_element_op
,
cde_element_op
,
a_grid_desc_k0_m_k1
,
b_grid_desc_k0_n_k1
,
ds_grid_desc_mblock_mperblock_nblock_nperblock
,
e_grid_desc_mblock_mperblock_nblock_nperblock_
,
block_2_ctile_map
);
#endif
#else
#else
ignore
=
p_a_grid
;
ignore
=
p_a_grid
;
ignore
=
p_b_grid
;
ignore
=
p_b_grid
;
...
@@ -378,6 +359,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -378,6 +359,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
Number
<
NumDTensor
>
{});
Number
<
NumDTensor
>
{});
}
}
// desc for problem definition
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
MakeAGridDescriptor_M_K
<
ALayout
>
({},
{},
{},
{},
{},
{},
{},
{},
{},
{}))
>
;
MakeAGridDescriptor_M_K
<
ALayout
>
({},
{},
{},
{},
{},
{},
{},
{},
{},
{}))
>
;
using
BGridDesc_N_K
=
remove_cvref_t
<
decltype
(
MakeBGridDescriptor_N_K
<
BLayout
>
({},
{}))
>
;
using
BGridDesc_N_K
=
remove_cvref_t
<
decltype
(
MakeBGridDescriptor_N_K
<
BLayout
>
({},
{}))
>
;
...
@@ -395,10 +377,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -395,10 +377,6 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
BElementwiseOperation
,
BElementwiseOperation
,
CDEElementwiseOperation
,
CDEElementwiseOperation
,
InMemoryDataOperationEnum
::
Set
,
InMemoryDataOperationEnum
::
Set
,
AGridDesc_M_K
,
BGridDesc_N_K
,
DsGridDesc_M_N
,
EGridDesc_M_N
,
NumGemmKPrefetchStage
,
NumGemmKPrefetchStage
,
BlockSize
,
BlockSize
,
MPerBlock
,
MPerBlock
,
...
@@ -432,12 +410,19 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -432,12 +410,19 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
CDEBlockTransferScalarPerVector_NPerBlock
,
CDEBlockTransferScalarPerVector_NPerBlock
,
LoopSched
>
;
LoopSched
>
;
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
// desc for blockwise copy
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
AGridDesc_M_K
{}))
>
;
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
AGridDesc_M_K
{}))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
BGridDesc_N_K
{}))
>
;
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
BGridDesc_N_K
{}))
>
;
using
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
DsGridDesc_M_N
{}))
>
;
using
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
EGridDesc_M_N
{}))
>
;
using
Block2ETileMap
=
typename
GridwiseGemm
::
DefaultBlock2ETileMap
;
// block-to-e-tile map
using
Block2ETileMap
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
EGridDesc_M_N
{}))
>
;
// Argument
// Argument
struct
Argument
:
public
BaseArgument
struct
Argument
:
public
BaseArgument
...
@@ -467,6 +452,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -467,6 +452,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
p_b_grid_
{
static_cast
<
const
BDataType
*>
(
p_b
)},
p_b_grid_
{
static_cast
<
const
BDataType
*>
(
p_b
)},
p_ds_grid_
{},
p_ds_grid_
{},
p_e_grid_
{
static_cast
<
EDataType
*>
(
p_e
)},
p_e_grid_
{
static_cast
<
EDataType
*>
(
p_e
)},
num_group_
{
a_g_n_c_wis_lengths
[
0
]},
a_grid_desc_m_k_
{
DeviceOp
::
MakeAGridDescriptor_M_K
<
ALayout
>
(
a_g_n_c_wis_lengths
,
a_grid_desc_m_k_
{
DeviceOp
::
MakeAGridDescriptor_M_K
<
ALayout
>
(
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_lengths
,
...
@@ -561,6 +547,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -561,6 +547,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
EDataType
*
p_e_grid_
;
EDataType
*
p_e_grid_
;
// tensor descriptors for problem definiton
// tensor descriptors for problem definiton
index_t
num_group_
;
AGridDesc_M_K
a_grid_desc_m_k_
;
AGridDesc_M_K
a_grid_desc_m_k_
;
BGridDesc_N_K
b_grid_desc_n_k_
;
BGridDesc_N_K
b_grid_desc_n_k_
;
DsGridDesc_M_N
ds_grid_desc_m_n_
;
DsGridDesc_M_N
ds_grid_desc_m_n_
;
...
@@ -569,14 +556,14 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
...
@@ -569,14 +556,14 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
// tensor descriptors for block/thread-wise copy
// tensor descriptors for block/thread-wise copy
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
typename
GridwiseGemm
::
DsGridDesc
riptor
_MBlock_MPerBlock_NBlock_NPerBlock
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock_
;
ds_grid_desc_mblock_mperblock_nblock_nperblock_
;
typename
GridwiseGemm
::
EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock_
;
e_grid_desc_mblock_mperblock_nblock_nperblock_
;
// block-to-e-tile map
// block-to-e-tile map
Block2ETileMap
block_2_etile_map_
;
Block2ETileMap
block_2_etile_map_
;
// for computing batch offset
ComputePtrOffsetOfStridedBatch
<
NumDTensor
>
compute_ptr_offset_of_batch_
;
ComputePtrOffsetOfStridedBatch
<
NumDTensor
>
compute_ptr_offset_of_batch_
;
// element-wise op
// element-wise op
...
@@ -622,8 +609,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
            }

            const index_t grid_size =
-               arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) *
-               arg.a_g_n_c_wis_lengths_[0]; // Group count
+               arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_;

            const auto K =
                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
...
@@ -631,7 +617,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
            auto launch_kernel = [&](auto has_main_k_block_loop) {
                constexpr bool has_main_loop = has_main_k_block_loop.value;

-               const auto kernel = kernel_batch_gemm_multiple_d_xdl_cshuffle<
+               const auto kernel = kernel_grouped_conv_fwd_multiple_d_xdl_cshuffle<
                    GridwiseGemm,
                    ADataType, // TODO: distinguish A/B datatype
                    typename GridwiseGemm::DsGridPointer,
...
@@ -641,8 +627,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
                    CDEElementwiseOperation,
                    DeviceOp::AGridDesc_AK0_M_AK1,
                    DeviceOp::BGridDesc_BK0_N_BK1,
-                   typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
-                   typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                   DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                   DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                    Block2ETileMap,
                    ComputePtrOffsetOfStridedBatch<NumDTensor>,
                    has_main_loop>;
...
@@ -798,7 +784,8 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle
               is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
               is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
               is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
-              is_same_v<DLayout, ctc::NDHWGK>)
+              is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
+              is_same_v<DLayout, ctc::G_K>)
            {
                const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
...
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>

#include "device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename ALayout,
          typename B0Layout,
          typename B1Layout,
          typename CPermuteNumDims_G_M_Gemm1N, // Sequence<>
          typename ADataType,
          typename B0DataType,
          typename B1DataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
          typename Acc0ElementwiseOperation,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation>
struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator
{
    struct ProblemDesc
    {
        // Overall problem shape
        index_t M;
        index_t N;
        index_t K;
        index_t O;
        index_t Batch;

        // Stride for A/B0/B1; layout determined by template args
        index_t StrideA;
        index_t StrideB0;
        index_t StrideB1;

        index_t BatchStrideA;
        index_t BatchStrideB0;
        index_t BatchStrideB1;

        // Lengths and strides for output C
        std::vector<index_t> c_gs_ms_os_lengths;
        std::vector<index_t> c_gs_ms_os_strides;
    };

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(std::vector<const void*> p_a_vec,
                        std::vector<const void*> p_b0_vec,
                        std::vector<const void*> p_b1_vec,
                        std::vector<void*> p_c_vec,
                        std::vector<ProblemDesc> problem_desc_vec,
                        AElementwiseOperation a_element_op,
                        B0ElementwiseOperation b0_element_op,
                        Acc0ElementwiseOperation acc0_element_op,
                        B1ElementwiseOperation b1_element_op,
                        CElementwiseOperation c_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
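The interface above is the contract the XDL backend in the next file implements. The following is an editorial host-side sketch, not part of the commit, of how a caller might drive it; `GroupedGemmSoftmaxGemmInstance`, the element-wise operation types, and the device pointers `p_a`, `p_b0`, `p_b1`, `p_c` are all assumed to be defined elsewhere.

// Editorial sketch: one group, Batch = 16, M = N = 256, K = O = 64, with plausible strides.
using Desc = typename GroupedGemmSoftmaxGemmInstance::ProblemDesc;

Desc desc{};
desc.M = 256; desc.N = 256; desc.K = 64; desc.O = 64; desc.Batch = 16;
desc.StrideA = 64; desc.StrideB0 = 64; desc.StrideB1 = 64;
desc.BatchStrideA  = 256 * 64;
desc.BatchStrideB0 = 256 * 64;
desc.BatchStrideB1 = 256 * 64;
desc.c_gs_ms_os_lengths = {16, 256, 64};
desc.c_gs_ms_os_strides = {256 * 64, 64, 1};

GroupedGemmSoftmaxGemmInstance device_op;
auto argument = device_op.MakeArgumentPointer({p_a}, {p_b0}, {p_b1}, {p_c}, {desc},
                                              AElementOp{}, B0ElementOp{}, Acc0ElementOp{},
                                              B1ElementOp{}, CElementOp{});
auto invoker  = device_op.MakeInvokerPointer();
if(device_op.IsSupportedArgument(argument.get()))
{
    invoker->Run(argument.get(), StreamConfig{});
}

The concrete XDL implementation below additionally stages its per-group kernel arguments through a workspace buffer; see the note after its Invoker.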
include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename GridwiseGemm,
          typename GroupKernelArg,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename AccElementwiseOperation,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
        kernel_grouped_gemm_softmax_gemm_xdl_cshuffle_v1(
            const void CK_CONSTANT_ADDRESS_SPACE* group_kernel_args,
            const index_t group_count,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const AccElementwiseOperation acc_element_op,
            const B1ElementwiseOperation b1_element_op,
            const CElementwiseOperation c_element_op)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();

    const auto arg_ptr = reinterpret_cast<const GroupKernelArg*>(
        cast_pointer_to_generic_address_space(group_kernel_args));

    index_t left     = 0;
    index_t right    = group_count;
    index_t group_id = index_t((left + right) / 2);

    while((!(block_id >= arg_ptr[group_id].block_start_ &&
             block_id < arg_ptr[group_id].block_end_)) &&
          left <= right)
    {
        if(block_id < arg_ptr[group_id].block_start_)
        {
            right = group_id;
        }
        else
        {
            left = group_id;
        }
        group_id = index_t((left + right) / 2);
    }

    // per-group batch offset
    const index_t num_blocks_per_batch = arg_ptr[group_id].num_blocks_per_batch_;
    const index_t g_idx = __builtin_amdgcn_readfirstlane(
        (block_id - arg_ptr[group_id].block_start_) / num_blocks_per_batch);

    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
        arg_ptr[group_id].compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
        arg_ptr[group_id].compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
    const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
        arg_ptr[group_id].compute_base_ptr_of_batch_.GetB1BasePtr(g_idx)));
    const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(
        arg_ptr[group_id].compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));

    GridwiseGemm::template Run<HasMainKBlockLoop>(
        arg_ptr[group_id].p_a_grid_ + a_batch_offset,
        arg_ptr[group_id].p_b_grid_ + b_batch_offset,
        arg_ptr[group_id].p_b1_grid_ + b1_batch_offset,
        arg_ptr[group_id].p_c_grid_ + c_batch_offset,
        p_shared,
        a_element_op,
        b_element_op,
        acc_element_op,
        b1_element_op,
        c_element_op,
        arg_ptr[group_id].a_grid_desc_ak0_m_ak1_,
        arg_ptr[group_id].b_grid_desc_bk0_n_bk1_,
        arg_ptr[group_id].b1_grid_desc_bk0_n_bk1_,
        arg_ptr[group_id].c_grid_desc_mblock_mperblock_nblock_nperblock_,
        arg_ptr[group_id].block_2_ctile_map_,
        arg_ptr[group_id].c0_matrix_mask_);
#else
    ignore = group_kernel_args;
    ignore = group_count;
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = acc_element_op;
    ignore = b1_element_op;
    ignore = c_element_op;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
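// Editorial sketch (not part of the commit): the group lookup above can be reproduced on the
// host against a plain vector of [block_start_, block_end_) ranges, which is handy for unit
// testing the block-to-group mapping; the ranges are assumed contiguous and ascending.
//
//   struct BlockRange { ck::index_t block_start_; ck::index_t block_end_; };
//
//   inline ck::index_t FindGroupId(const std::vector<BlockRange>& groups, ck::index_t block_id)
//   {
//       ck::index_t left = 0, right = static_cast<ck::index_t>(groups.size());
//       ck::index_t group_id = (left + right) / 2;
//       while(!(block_id >= groups[group_id].block_start_ &&
//               block_id < groups[group_id].block_end_) &&
//             left <= right)
//       {
//           if(block_id < groups[group_id].block_start_) { right = group_id; }
//           else                                         { left = group_id; }
//           group_id = (left + right) / 2;
//       }
//       return group_id;
//   }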
// Computes C = A * B0 * B1
//              ^^^^^^ (Acc0)
//              ^^^^^^^^^^^ (Acc1)
template <typename ALayout,
          typename BLayout, // B0Layout
          typename B1Layout,
          typename CPermuteNumDims_G_M_Gemm1N, // Sequence<NumDimG, NumDimM, NumDimGemm1N>
          typename ADataType,
          typename BDataType,
          typename B1DataType,
          typename CDataType,
          typename GemmAccDataType,
          typename CShuffleDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename AccElementwiseOperation,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          GemmSpecialization GemmSpec,
          index_t NumGemmKPrefetchStage,
          index_t BlockSize,
          index_t MPerBlock,
          index_t NPerBlock, // Gemm0NPerBlock
          index_t KPerBlock, // Gemm0KPerBlock
          index_t Gemm1NPerBlock,
          index_t Gemm1KPerBlock,
          index_t AK1,
          index_t BK1,
          index_t B1K1,
          index_t MPerXDL,
          index_t NPerXDL,
          index_t MXdlPerWave,
          index_t NXdlPerWave,
          index_t Gemm1NXdlPerWave,
          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          index_t ABlockTransferSrcVectorDim,
          index_t ABlockTransferSrcScalarPerVector,
          index_t ABlockTransferDstScalarPerVector_AK1,
          bool ABlockLdsExtraM,
          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          index_t BBlockTransferSrcVectorDim,
          index_t BBlockTransferSrcScalarPerVector,
          index_t BBlockTransferDstScalarPerVector_BK1,
          bool BBlockLdsExtraN,
          typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
          typename B1BlockTransferThreadClusterArrangeOrder,
          typename B1BlockTransferSrcAccessOrder,
          index_t B1BlockTransferSrcVectorDim,
          index_t B1BlockTransferSrcScalarPerVector,
          index_t B1BlockTransferDstScalarPerVector_BK1,
          bool B1BlockLdsExtraN,
          index_t CShuffleMXdlPerWavePerShuffle,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
          bool MaskOutUpperTriangle,
          LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
    : public DeviceGroupedGemmSoftmaxGemmPermute<ALayout,
                                                 BLayout,
                                                 B1Layout,
                                                 CPermuteNumDims_G_M_Gemm1N,
                                                 ADataType,
                                                 BDataType,
                                                 B1DataType,
                                                 CDataType,
                                                 AElementwiseOperation,
                                                 BElementwiseOperation,
                                                 AccElementwiseOperation,
                                                 B1ElementwiseOperation,
                                                 CElementwiseOperation>
{
    using DeviceOp    = DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle;
    using ProblemDesc = typename DeviceGroupedGemmSoftmaxGemmPermute<ALayout,
                                                                     BLayout,
                                                                     B1Layout,
                                                                     CPermuteNumDims_G_M_Gemm1N,
                                                                     ADataType,
                                                                     BDataType,
                                                                     B1DataType,
                                                                     CDataType,
                                                                     AElementwiseOperation,
                                                                     BElementwiseOperation,
                                                                     AccElementwiseOperation,
                                                                     B1ElementwiseOperation,
                                                                     CElementwiseOperation>::ProblemDesc;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

    static constexpr auto matrix_padder =
        GemmGemmPadder<GemmSpec, index_t, index_t, index_t, index_t>{
            MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock};

    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
    {
        const auto a_grid_desc_mraw_kraw = [&]() {
            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
            {
                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
                                                    make_tuple(StrideA, I1));
            }
            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
            {
                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
                                                    make_tuple(I1, StrideA));
            }
        }();

        const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);

        const auto M = a_grid_desc_m_k.GetLength(I0);
        const auto K = a_grid_desc_m_k.GetLength(I1);

        const auto AK0 = K / AK1;

        return transform_tensor_descriptor(
            a_grid_desc_m_k,
            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
                       make_pass_through_transform(M)),
            make_tuple(Sequence<1>{}, Sequence<0>{}),
            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
    }
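    // Illustrative note (editorial addition, not part of the commit): assuming a row-major A with
    // MRaw = 128, KRaw = 64, StrideA = 64 and AK1 = 8, matrix_padder leaves K at 64, so
    // AK0 = K / AK1 = 8 and the returned descriptor views A as a 3-D tensor of shape
    // (AK0, M, AK1) = (8, 128, 8); the unmerge transform is what splits K into (AK0, AK1).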
    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
    {
        const auto b_grid_desc_nraw_kraw = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
                                                    make_tuple(I1, StrideB));
            }
            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
                                                    make_tuple(StrideB, I1));
            }
        }();

        const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);

        const auto N = b_grid_desc_n_k.GetLength(I0);
        const auto K = b_grid_desc_n_k.GetLength(I1);

        const auto BK0 = K / BK1;

        return transform_tensor_descriptor(
            b_grid_desc_n_k,
            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
                       make_pass_through_transform(N)),
            make_tuple(Sequence<1>{}, Sequence<0>{}),
            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
    }

    // Args: Gemm1KRaw, Gemm1NRaw, StrideB1
    static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
    {
        const auto b1_grid_desc_nraw_kraw = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, B1Layout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
                                                    make_tuple(I1, StrideB));
            }
            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, B1Layout>::value)
            {
                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
                                                    make_tuple(StrideB, I1));
            }
        }();

        const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw);

        const auto N = b1_grid_desc_n_k.GetLength(I0);
        const auto K = b1_grid_desc_n_k.GetLength(I1);

        const auto B1K0 = K / B1K1;

        return transform_tensor_descriptor(
            b1_grid_desc_n_k,
            make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)),
                       make_pass_through_transform(N)),
            make_tuple(Sequence<1>{}, Sequence<0>{}),
            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
    }

    // assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
    static auto MakeCGridDescriptor_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
                                        const std::vector<index_t>& c_gs_ms_ns_strides_vec)
    {
        constexpr index_t NumDimG = CPermuteNumDims_G_M_Gemm1N::At(I0);
        constexpr index_t NumDimM = CPermuteNumDims_G_M_Gemm1N::At(I1);
        constexpr index_t NumDimN = CPermuteNumDims_G_M_Gemm1N::At(I2); // NumDimGemm1N

        assert(c_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
               c_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);

        const auto to_tuple = [&](auto& vec, auto start, auto end) {
            return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
        };

        const auto c_ms_ns_lengths = to_tuple(
            c_gs_ms_ns_lengths_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});
        const auto c_ms_ns_strides = to_tuple(
            c_gs_ms_ns_strides_vec, Number<NumDimG>{}, Number<NumDimG + NumDimM + NumDimN>{});

        // dimension Ids for M0, M1, ...
        constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{};

        // dimension Ids for N0, N1, ...
        constexpr auto nDimIds =
            typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimN, 1>::type{};

        // lengths for M0, M1, ...
        const auto mLengths = get_container_subset(c_ms_ns_lengths, mDimIds);

        // lengths for N0, N1, ...
        const auto nLengths = get_container_subset(c_ms_ns_lengths, nDimIds);

        // naive tensor C[M0, M1, M2, ..., N0, N1, N2...]
        const auto c_grid_desc_ms_ns =
            make_naive_tensor_descriptor(c_ms_ns_lengths, c_ms_ns_strides);

        // transformed tensor C[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...]
        const auto c_grid_desc_mraw_nraw = transform_tensor_descriptor(
            c_grid_desc_ms_ns,
            make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)),
            make_tuple(mDimIds, nDimIds),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw);
    }
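    // Illustrative note (editorial addition, assumed shapes): with
    // CPermuteNumDims_G_M_Gemm1N = Sequence<1, 2, 1>, lengths {G0, M0, M1, N0} and matching
    // strides, the G dimensions are skipped, (M0, M1) is merged into MRaw = M0 * M1 and (N0)
    // into NRaw = N0, and the padded 2-D view C[MRaw, NRaw] is what the GEMM pipeline consumes.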
    // assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
    static auto MakeCGridDescriptor_G_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
                                          const std::vector<index_t>& c_gs_ms_ns_strides_vec)
    {
        constexpr index_t NumDimG = CPermuteNumDims_G_M_Gemm1N::At(I0);
        constexpr index_t NumDimM = CPermuteNumDims_G_M_Gemm1N::At(I1);
        constexpr index_t NumDimN = CPermuteNumDims_G_M_Gemm1N::At(I2); // NumDimGemm1N

        assert(c_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
               c_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN);

        const auto to_tuple = [&](auto& vec, auto start, auto end) {
            return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
        };

        const auto c_gs_ms_ns_lengths =
            to_tuple(c_gs_ms_ns_lengths_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});
        const auto c_gs_ms_ns_strides =
            to_tuple(c_gs_ms_ns_strides_vec, Number<0>{}, Number<NumDimG + NumDimM + NumDimN>{});

        // dimension Ids for G0, G1, ...
        constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{};

        // dimension Ids for M0, M1, ...
        constexpr auto mDimIds =
            typename arithmetic_sequence_gen<NumDimG, NumDimG + NumDimM, 1>::type{};

        // dimension Ids for N0, N1, ...
        constexpr auto nDimIds = typename arithmetic_sequence_gen<NumDimG + NumDimM,
                                                                  NumDimG + NumDimM + NumDimN,
                                                                  1>::type{};

        // lengths for G0, G1, ...
        const auto gLengths = get_container_subset(c_gs_ms_ns_lengths, gDimIds);

        // lengths for M0, M1, ...
        const auto mLengths = get_container_subset(c_gs_ms_ns_lengths, mDimIds);

        // lengths for N0, N1, ...
        const auto nLengths = get_container_subset(c_gs_ms_ns_lengths, nDimIds);

        // naive tensor C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...]
        const auto c_grid_desc_gs_ms_ns =
            make_naive_tensor_descriptor(c_gs_ms_ns_lengths, c_gs_ms_ns_strides);

        // transformed tensor C[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 *
        // N2 * ...]
        const auto c_grid_desc_g_mraw_nraw = transform_tensor_descriptor(
            c_grid_desc_gs_ms_ns,
            make_tuple(make_merge_transform(gLengths),
                       make_merge_transform(mLengths),
                       make_merge_transform(nLengths)),
            make_tuple(gDimIds, mDimIds, nDimIds),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

        // this desc is only for calculating batch offset so no padding needed
        return c_grid_desc_g_mraw_nraw;
    }

    using AGridDesc_AK0_M_AK1  = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
    using BGridDesc_BK0_N_BK1  = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
    using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1));
    using CGridDesc_M_N        = decltype(MakeCGridDescriptor_M_N({}, {}));
    using CGridDesc_G_M_N      = decltype(MakeCGridDescriptor_G_M_N({}, {}));

    // to track the points which need to be set to -inf on C0
    // Note: no need to reset M padding value, because they will not be stored out.
    struct C0MatrixMask
    {
        C0MatrixMask(index_t NRaw) : NRaw_(NRaw) {}

        __host__ __device__ bool IsUpperTriangle(index_t m, index_t n) const { return n > m; }

        __host__ __device__ bool IsNOutOfBound(/*index_t m, */ index_t n) const
        {
            return n >= NRaw_;
        }

        __host__ __device__ bool IsMaskedElement(index_t m, index_t n) const
        {
            return IsUpperTriangle(m, n) || IsNOutOfBound(n);
        }

        private:
        // index_t MRaw_;
        index_t NRaw_;
    };
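    // Hedged illustration (editorial addition): with MaskOutUpperTriangle enabled and NRaw = 4,
    // IsMaskedElement marks the strictly-upper-triangular and N-padded positions of a Gemm0 tile,
    // i.e. the entries the kernel replaces with -inf before the softmax:
    //
    //   C0MatrixMask mask(/*NRaw=*/4);
    //   // mask.IsMaskedElement(m, n) over m, n in [0, 4) yields
    //   //   0 1 1 1
    //   //   0 0 1 1
    //   //   0 0 0 1
    //   //   0 0 0 0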
    struct ComputeBasePtrOfStridedBatch
    {
        ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
                                     index_t BatchStrideB,
                                     index_t BatchStrideB1,
                                     CGridDesc_G_M_N c_grid_desc_g_m_n)
            : BatchStrideA_(BatchStrideA),
              BatchStrideB_(BatchStrideB),
              BatchStrideB1_(BatchStrideB1),
              c_grid_desc_g_m_n_(c_grid_desc_g_m_n)
        {
        }

        __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideA_);
        }

        __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideB_);
        }

        __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const
        {
            return g_idx * static_cast<long_index_t>(BatchStrideB1_);
        }

        __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
        {
            return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0));
        }

        private:
        index_t BatchStrideA_;
        index_t BatchStrideB_;
        index_t BatchStrideB1_;
        CGridDesc_G_M_N c_grid_desc_g_m_n_;
    };
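    // Hedged example (editorial addition, assumed sizes): for batch g_idx = 3 with
    // BatchStrideA = M * K, the A pointer used by that batch is
    //   p_a_grid + 3 * static_cast<long_index_t>(M * K),
    // whereas the C offset is taken from the (G, M, N) descriptor,
    //   c_grid_desc_g_m_n.CalculateOffset(make_multi_index(3, 0, 0)),
    // which is what lets the output honour an arbitrary G/M/O permutation.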
    // GridwiseGemm
    using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle<
        ADataType, // TODO: distinguish A/B datatype
        GemmAccDataType,
        CShuffleDataType,
        CDataType,
        AElementwiseOperation,
        BElementwiseOperation,
        AccElementwiseOperation,
        B1ElementwiseOperation,
        CElementwiseOperation,
        InMemoryDataOperationEnum::Set,
        AGridDesc_AK0_M_AK1,
        BGridDesc_BK0_N_BK1,
        B1GridDesc_BK0_N_BK1,
        CGridDesc_M_N,
        NumGemmKPrefetchStage,
        BlockSize,
        MPerBlock,
        NPerBlock,
        KPerBlock,
        Gemm1NPerBlock,
        Gemm1KPerBlock,
        AK1,
        BK1,
        B1K1,
        MPerXDL,
        NPerXDL,
        MXdlPerWave,
        NXdlPerWave,
        Gemm1NXdlPerWave,
        ABlockTransferThreadClusterLengths_AK0_M_AK1,
        ABlockTransferThreadClusterArrangeOrder,
        ABlockTransferSrcAccessOrder,
        ABlockTransferSrcVectorDim,
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_AK1,
        true,
        ABlockLdsExtraM,
        BBlockTransferThreadClusterLengths_BK0_N_BK1,
        BBlockTransferThreadClusterArrangeOrder,
        BBlockTransferSrcAccessOrder,
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_BK1,
        true,
        BBlockLdsExtraN,
        B1BlockTransferThreadClusterLengths_BK0_N_BK1,
        B1BlockTransferThreadClusterArrangeOrder,
        B1BlockTransferSrcAccessOrder,
        B1BlockTransferSrcVectorDim,
        B1BlockTransferSrcScalarPerVector,
        B1BlockTransferDstScalarPerVector_BK1,
        false,
        B1BlockLdsExtraN,
        CShuffleMXdlPerWavePerShuffle,
        CShuffleNXdlPerWavePerShuffle,
        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CShuffleBlockTransferScalarPerVector_NPerBlock,
        LoopSched,
        matrix_padder.PadN,
        MaskOutUpperTriangle>;

    using Block2CTileMap = OffsettedBlockToCTileMap<typename GridwiseGemm::DefaultBlock2CTileMap>;

    struct GroupKernelArg
    {
        // pointers
        const ADataType* p_a_grid_;
        const BDataType* p_b_grid_;
        const B1DataType* p_b1_grid_;
        CDataType* p_c_grid_;

        // tensor descriptors for block/thread-wise copy
        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
        B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_;
        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
            c_grid_desc_mblock_mperblock_nblock_nperblock_;

        // batch & stride
        index_t num_blocks_per_batch_;
        ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;

        // check C0 masking and padding
        C0MatrixMask c0_matrix_mask_;

        // block-to-c-tile map
        Block2CTileMap block_2_ctile_map_;
        index_t block_start_, block_end_;
    };

    struct GroupDeviceArg
    {
        // problem definition
        index_t M;
        index_t N;
        index_t K;
        index_t O;

        // Strides for the last dimensions of C for sanity check of vector load/store
        index_t c_extent_lowest_;
        index_t c_stride_lowest_;

        CGridDesc_M_N c_grid_desc_m_n_;
    };
    // Argument
    // FIXME: constness
    struct Argument : public BaseArgument
    {
        Argument(std::vector<const void*> p_a_vec,
                 std::vector<const void*> p_b_vec,
                 std::vector<const void*> p_b1_vec,
                 std::vector<void*> p_c_vec,
                 std::vector<ProblemDesc> problem_desc_vec,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 AccElementwiseOperation acc_element_op,
                 B1ElementwiseOperation b1_element_op,
                 CElementwiseOperation c_element_op)
            : a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              acc_element_op_{acc_element_op},
              b1_element_op_{b1_element_op},
              c_element_op_{c_element_op}
        {
            group_count_ = problem_desc_vec.size();

            if(!(group_count_ == p_a_vec.size() && group_count_ == p_b_vec.size() &&
                 group_count_ == p_b1_vec.size() && group_count_ == p_c_vec.size()))
            {
                throw std::runtime_error("wrong! group_count_ != a/b/b1/c_vec.size");
            }

            grid_size_ = 0;

            for(std::size_t i = 0; i < group_count_; i++)
            {
                const auto p_a_grid  = static_cast<const ADataType*>(p_a_vec[i]);
                const auto p_b_grid  = static_cast<const BDataType*>(p_b_vec[i]);
                const auto p_b1_grid = static_cast<const B1DataType*>(p_b1_vec[i]);
                const auto p_c_grid  = static_cast<CDataType*>(p_c_vec[i]);

                const auto a_grid_desc_ak0_m_ak1 = DeviceOp::MakeAGridDescriptor_AK0_M_AK1(
                    problem_desc_vec[i].M, problem_desc_vec[i].K, problem_desc_vec[i].StrideA);
                const auto b_grid_desc_bk0_n_bk1 = DeviceOp::MakeBGridDescriptor_BK0_N_BK1(
                    problem_desc_vec[i].K, problem_desc_vec[i].N, problem_desc_vec[i].StrideB0);
                const auto b1_grid_desc_bk0_n_bk1 = DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(
                    problem_desc_vec[i].N, problem_desc_vec[i].O, problem_desc_vec[i].StrideB1);
                const auto c_grid_desc_m_n =
                    DeviceOp::MakeCGridDescriptor_M_N(problem_desc_vec[i].c_gs_ms_os_lengths,
                                                      problem_desc_vec[i].c_gs_ms_os_strides);

                const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                        c_grid_desc_m_n);

                const index_t BlockStart     = grid_size_;
                const auto block_2_ctile_map = Block2CTileMap(c_grid_desc_m_n, BlockStart);
                const index_t grid_size_grp =
                    block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n) *
                    problem_desc_vec[i].Batch;
                const index_t BlockEnd = grid_size_ + grid_size_grp;

                // batch stride
                // TODO ANT: only keep batch stride in tensor desc to reduce scalar cache pressure
                const auto c_grid_desc_g_m_n =
                    DeviceOp::MakeCGridDescriptor_G_M_N(problem_desc_vec[i].c_gs_ms_os_lengths,
                                                        problem_desc_vec[i].c_gs_ms_os_strides);
                const auto compute_base_ptr_of_batch =
                    ComputeBasePtrOfStridedBatch(problem_desc_vec[i].BatchStrideA,
                                                 problem_desc_vec[i].BatchStrideB0,
                                                 problem_desc_vec[i].BatchStrideB1,
                                                 c_grid_desc_g_m_n);

                // C0 mask
                const auto c0_matrix_mask = C0MatrixMask(problem_desc_vec[i].N);

                grid_size_ += grid_size_grp;

                group_kernel_args_.push_back({p_a_grid,
                                              p_b_grid,
                                              p_b1_grid,
                                              p_c_grid,
                                              a_grid_desc_ak0_m_ak1,
                                              b_grid_desc_bk0_n_bk1,
                                              b1_grid_desc_bk0_n_bk1,
                                              c_grid_desc_mblock_mperblock_nblock_nperblock,
                                              block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n),
                                              compute_base_ptr_of_batch,
                                              c0_matrix_mask,
                                              block_2_ctile_map,
                                              BlockStart,
                                              BlockEnd});

                group_device_args_.push_back({problem_desc_vec[i].M,
                                              problem_desc_vec[i].N,
                                              problem_desc_vec[i].K,
                                              problem_desc_vec[i].O,
                                              problem_desc_vec[i].c_gs_ms_os_lengths.back(),
                                              problem_desc_vec[i].c_gs_ms_os_strides.back(),
                                              c_grid_desc_m_n});
            }
        }

        std::vector<GroupKernelArg> group_kernel_args_;
        std::vector<GroupDeviceArg> group_device_args_;

        std::size_t group_count_;
        index_t grid_size_;

        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        AccElementwiseOperation acc_element_op_;
        B1ElementwiseOperation b1_element_op_;
        CElementwiseOperation c_element_op_;
    };
    // Invoker
    struct Invoker : public BaseInvoker
    {
        using Argument = DeviceOp::Argument;

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            if(!DeviceOp::IsSupportedArgument(arg))
            {
                throw std::runtime_error("wrong! unsupported argument");
            }

            bool all_has_main_k_block_loop  = true;
            bool some_has_main_k_block_loop = false;

            for(std::size_t i = 0; i < arg.group_count_; i++)
            {
                const auto K = arg.group_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I0) *
                               arg.group_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I2);
                const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K);
                all_has_main_k_block_loop &= y;
                some_has_main_k_block_loop |= y;
            }

            hipGetErrorString(hipMemcpy(arg.p_workspace_,
                                        arg.group_kernel_args_.data(),
                                        arg.group_kernel_args_.size() * sizeof(GroupKernelArg),
                                        hipMemcpyHostToDevice));

            float ave_time = 0;

            auto launch_kernel = [&](auto has_main_k_block_loop_) {
                const auto kernel =
                    kernel_grouped_gemm_softmax_gemm_xdl_cshuffle_v1<GridwiseGemm,
                                                                     GroupKernelArg,
                                                                     AElementwiseOperation,
                                                                     BElementwiseOperation,
                                                                     AccElementwiseOperation,
                                                                     B1ElementwiseOperation,
                                                                     CElementwiseOperation,
                                                                     has_main_k_block_loop_>;

                return launch_and_time_kernel(
                    stream_config,
                    kernel,
                    dim3(arg.grid_size_),
                    dim3(BlockSize),
                    0,
                    cast_pointer_to_constant_address_space(arg.p_workspace_),
                    arg.group_count_,
                    arg.a_element_op_,
                    arg.b_element_op_,
                    arg.acc_element_op_,
                    arg.b1_element_op_,
                    arg.c_element_op_);
            };

            // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need
            // to concern Gemm0's loop
            if(all_has_main_k_block_loop)
            {
                ave_time = launch_kernel(integral_constant<bool, true>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                ave_time = launch_kernel(integral_constant<bool, false>{});
            }
            else
            {
                throw std::runtime_error("wrong! all gemm problems have to simultaneously meet "
                                         "has_main_k_block_loop or no_main_k_block_loop");
            }

            return ave_time;
        }

        // polymorphic
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };
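    // Hedged host-side sketch (editorial addition; helper names are assumptions): the grouped
    // kernel reads its per-group arguments through arg.p_workspace_, so a caller has to provide
    // a device buffer of GetWorkSpaceSize(&arg) bytes before Invoker::Run copies
    // group_kernel_args_ into it, e.g.
    //
    //   auto arg     = device_op.MakeArgumentPointer(p_a_vec, p_b_vec, p_b1_vec, p_c_vec,
    //                                                problem_descs, a_op, b_op, acc_op, b1_op, c_op);
    //   void* p_work = nullptr;
    //   hipMalloc(&p_work, device_op.GetWorkSpaceSize(arg.get()));
    //   arg->p_workspace_ = p_work; // BaseArgument member consumed by Invoker::Run above
    //   device_op.MakeInvokerPointer()->Run(arg.get(), StreamConfig{nullptr, true});
    //   hipFree(p_work);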
    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    static bool IsSupportedArgument(const Argument& arg)
    {
        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
        {
            return false;
        }

        bool all_has_main_k_block_loop  = true;
        bool some_has_main_k_block_loop = false;

        for(std::size_t i = 0; i < arg.group_count_; i++)
        {
            const auto& kernel_arg = arg.group_kernel_args_[i];
            const auto& device_arg = arg.group_device_args_[i];

            // Check if C permute dimension matches GEMM + GEMM shape
            const index_t c_m       = device_arg.c_grid_desc_m_n_.GetLength(I0);
            const index_t c_gemm1n  = device_arg.c_grid_desc_m_n_.GetLength(I1);
            const index_t a_m       = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I1);
            const index_t b1_gemm1n = kernel_arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1);
            if(!(c_m == a_m && c_gemm1n == b1_gemm1n))
            {
                return false;
            }

            // Check if having main loop
            const auto K = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) *
                           kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
            const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K);
            all_has_main_k_block_loop &= y;
            some_has_main_k_block_loop |= y;

            // Note: we need raw lengths since threadwise copy can not handle vector load when
            // part of vector is out of bounds
            const auto MRaw      = device_arg.M;
            const auto NRaw      = device_arg.N;
            const auto KRaw      = device_arg.K;
            const auto Gemm1NRaw = device_arg.O;

            // Check scalar per vector requirement
            const auto a_extent_lowest =
                is_same_v<tensor_layout::gemm::RowMajor, ALayout> ? KRaw : MRaw;
            const auto b_extent_lowest =
                is_same_v<tensor_layout::gemm::RowMajor, BLayout> ? NRaw : KRaw;
            const auto b1_extent_lowest =
                is_same_v<tensor_layout::gemm::RowMajor, B1Layout> ? Gemm1NRaw : NRaw;
            const auto c_extent_lowest = device_arg.c_extent_lowest_;

            if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
                 b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 &&
                 b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
                 c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
            {
                return false;
            }

            // Check vector store requirement; assumes last dimension in N to be contiguous
            if(device_arg.c_stride_lowest_ != 1)
            {
                return false;
            }

            if(!GridwiseGemm::CheckValidity(kernel_arg.a_grid_desc_ak0_m_ak1_,
                                            kernel_arg.b_grid_desc_bk0_n_bk1_,
                                            kernel_arg.b1_grid_desc_bk0_n_bk1_,
                                            device_arg.c_grid_desc_m_n_,
                                            kernel_arg.block_2_ctile_map_))
            {
                return false;
            }
        }

        // all gemm problems have to simultaneously meet has_main_k_block_loop or
        // no_main_k_block_loop
        if(!(all_has_main_k_block_loop || !some_has_main_k_block_loop))
        {
            return false;
        }

        return true;
    }

    // polymorphic
    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }
    static auto MakeArgument(std::vector<const void*> p_a_vec,
                             std::vector<const void*> p_b_vec,
                             std::vector<const void*> p_b1_vec,
                             std::vector<void*> p_c_vec,
                             std::vector<ProblemDesc> problem_desc_vec,
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
                             AccElementwiseOperation acc_element_op,
                             B1ElementwiseOperation b1_element_op,
                             CElementwiseOperation c_element_op)
    {
        return Argument{p_a_vec,
                        p_b_vec,
                        p_b1_vec,
                        p_c_vec,
                        problem_desc_vec,
                        a_element_op,
                        b_element_op,
                        acc_element_op,
                        b1_element_op,
                        c_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }

    // polymorphic
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(std::vector<const void*> p_a_vec,
                        std::vector<const void*> p_b_vec,
                        std::vector<const void*> p_b1_vec,
                        std::vector<void*> p_c_vec,
                        std::vector<ProblemDesc> problem_desc_vec,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        AccElementwiseOperation acc_element_op,
                        B1ElementwiseOperation b1_element_op,
                        CElementwiseOperation c_element_op) override
    {
        return std::make_unique<Argument>(p_a_vec,
                                          p_b_vec,
                                          p_b1_vec,
                                          p_c_vec,
                                          problem_desc_vec,
                                          a_element_op,
                                          b_element_op,
                                          acc_element_op,
                                          b1_element_op,
                                          c_element_op);
    }

    // polymorphic
    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    // polymorphic
    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
            << KPerBlock << ", "
            << AK1 << ", "
            << BK1 << ", "
            << MPerBlock << ", "
            << Gemm1NPerBlock << ", "
            << Gemm1KPerBlock << ", "
            << B1K1 << ", "
            << getGemmSpecializationString(GemmSpec)
            << ">";
        // clang-format on

        return str.str();
    }

    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
    {
        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GroupKernelArg);
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
...
@@ -238,10 +238,6 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
        BElementwiseOperation,
        CDEElementwiseOperation,
        InMemoryDataOperationEnum::Set,
-       AGridDesc_M_K,
-       BGridDesc_N_K,
-       DsGridDesc_M_N,
-       EGridDesc_M_N,
        NumPrefetch, // NumGemmKPrefetchStage
        BlockSize,
        MPerBlock,
...
@@ -275,19 +271,19 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
        CDEBlockTransferScalarPerVector_NPerBlock,
        LoopSched>;

    using AGridDesc_AK0_M_AK1 = remove_cvref_t<decltype(
        GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<decltype(
        GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>;
+   using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+       GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))>;
+   using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+       GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))>;

    struct GroupedGemmBlock2ETileMap
    {
+       using UnderlyingBlock2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap;
+       using Block2ETileMap =
+           remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+       static_assert(std::is_same<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{})),
+                                  typename GridwiseGemm::DefaultBlock2ETileMap>::value,
+                     "Wrong! Should be the same type name");

        GroupedGemmBlock2ETileMap()
        {
...
@@ -321,7 +317,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
            return block_2_etile_map_.CheckValidity(e_grid_desc_m_n);
        }

-       typename GridwiseGemm::DefaultBlock2ETileMap block_2_etile_map_;
+       Block2ETileMap block_2_etile_map_;
        ck::index_t BlockStart_;
    };
...
@@ -342,10 +338,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
        // tensor descriptors for block/thread-wise copy
        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
-       typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-           ds_grid_desc_mblock_mperblock_nblock_nperblock_;
-       typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
-           e_grid_desc_mblock_mperblock_nblock_nperblock_;
+       DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock_;
+       EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;

        // block-to-e-tile map
        GroupedGemmBlock2ETileMap block_2_etile_map_;
...
@@ -440,7 +435,7 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                                           block_2_etile_map))
            {
                // tensor descriptors for block/thread-wise copy
-               typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+               DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                    ds_grid_desc_mblock_mperblock_nblock_nperblock;

                static_for<0, NumDTensor, 1>{}([&](auto j) {
...
include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp
...
@@ -23,11 +23,10 @@ template <typename GridwiseReduction,
          typename YDataType,
          typename AccDataType,
          typename AccElementwiseOperation,
-         typename GridDesc_M_K,
-         typename GridDesc_K>
+         typename GridDesc_M_K>
__global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k,
-                                const GridDesc_K gamma_grid_desc_k,
-                                const GridDesc_K beta_grid_desc_k,
+                                const GridDesc_M_K gamma_grid_desc_m_k,
+                                const GridDesc_M_K beta_grid_desc_m_k,
                                 const GridDesc_M_K y_grid_desc_m_k,
                                 index_t num_k_block_tile_iteration,
                                 AccDataType epsilon,
...
@@ -38,8 +37,8 @@ __global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k,
                                 const AccElementwiseOperation acc_elementwise_op)
{
    GridwiseReduction::Run(x_grid_desc_m_k,
-                          gamma_grid_desc_k,
-                          beta_grid_desc_k,
+                          gamma_grid_desc_m_k,
+                          beta_grid_desc_m_k,
                           y_grid_desc_m_k,
                           num_k_block_tile_iteration,
                           epsilon,
...
@@ -71,7 +70,9 @@ template <typename XDataType,
          index_t KThreadSliceSize,
          index_t XYSrcVectorDim,
          index_t XSrcVectorSize,
+         index_t GammaSrcVectorDim,
          index_t GammaSrcVectorSize,
+         index_t BetaSrcVectorDim,
          index_t BetaSrcVectorSize,
          index_t YDstVectorSize>
struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
...
@@ -84,11 +85,13 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                              NumReduceDim>
{
    static_assert(
-       (KThreadSliceSize % GammaSrcVectorSize == 0),
+       ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) ||
+        (GammaSrcVectorDim == 1 && KThreadSliceSize % GammaSrcVectorSize == 0)),
        "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!");

    static_assert(
-       (KThreadSliceSize % BetaSrcVectorSize == 0),
+       ((BetaSrcVectorDim == 0 && MThreadSliceSize % BetaSrcVectorSize == 0) ||
+        (BetaSrcVectorDim == 1 && KThreadSliceSize % BetaSrcVectorSize == 0)),
        "Invalid thread slice sizes and/or beta vector sizes configuration, please check!");

    using PassThrough = tensor_operation::element_wise::PassThrough;
...
@@ -162,38 +165,7 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
        return (in_grid_desc_m_k_padded);
    };

-   static auto MakeAffine1dDescriptor(const std::vector<index_t>& Lengths,
-                                      const std::vector<index_t>& Strides,
-                                      int blkGroupSize,
-                                      int numBlockTileIteration)
-   {
-       const auto tupleLengths = make_tuple_from_array(Lengths, Number<NumReduceDim>{});
-       const auto tupleStrides = make_tuple_from_array(Strides, Number<NumReduceDim>{});
-
-       auto desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides);
-
-       auto grid_desc_k = transform_tensor_descriptor(
-           desc,
-           make_tuple(make_merge_transform(tupleLengths)),
-           make_tuple(typename arithmetic_sequence_gen<0, NumReduceDim, 1>::type{}),
-           make_tuple(Sequence<0>{}));
-
-       const auto reduceTotalLength = grid_desc_k.GetLength(Number<0>{});
-       const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
-       const auto Pad_K             = reduceSizePerBlock * blkGroupSize - reduceTotalLength;
-
-       auto grid_desc_k_padded = transform_tensor_descriptor(
-           grid_desc_k,
-           make_tuple(make_right_pad_transform(reduceTotalLength, Pad_K)),
-           make_tuple(Sequence<0>{}),
-           make_tuple(Sequence<0>{}));
-
-       return (grid_desc_k_padded);
-   };
-
    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
-   using GridDesc_K   = decltype(MakeAffine1dDescriptor({1}, {1}, 1, 1));

    using GridwiseReduceLayernormGeneric =
        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
...
@@ -203,7 +175,6 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                                  AccDataType,
                                                  AccElementwiseOperation,
                                                  GridDesc_M_K,
-                                                 GridDesc_K,
                                                  BlockSize,
                                                  MThreadClusterSize,
                                                  KThreadClusterSize,
...
@@ -211,12 +182,13 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                                  KThreadSliceSize,
                                                  XYSrcVectorDim,
                                                  XSrcVectorSize,
+                                                 GammaSrcVectorDim,
                                                  GammaSrcVectorSize,
+                                                 BetaSrcVectorDim,
                                                  BetaSrcVectorSize,
                                                  XYSrcVectorDim,
                                                  YDstVectorSize,
                                                  false>;

    using GridwiseReduceLayernormSweepOnce =
        GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
                                                  GammaDataType,
...
@@ -225,7 +197,6 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                                  AccDataType,
                                                  AccElementwiseOperation,
                                                  GridDesc_M_K,
-                                                 GridDesc_K,
                                                  BlockSize,
                                                  MThreadClusterSize,
                                                  KThreadClusterSize,
...
@@ -233,7 +204,9 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                                  KThreadSliceSize,
                                                  XYSrcVectorDim,
                                                  XSrcVectorSize,
+                                                 GammaSrcVectorDim,
                                                  GammaSrcVectorSize,
+                                                 BetaSrcVectorDim,
                                                  BetaSrcVectorSize,
                                                  XYSrcVectorDim,
                                                  YDstVectorSize,
...
@@ -258,13 +231,13 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              p_y_(p_y),
-             gammaStrides_(gammaStrides),
-             betaStrides_(betaStrides),
              acc_elementwise_op_(acc_elementwise_op)
        {
            Lengths_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(lengths, reduceDims);
            xStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(xStrides, reduceDims);
            yStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
+           gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
+           betaStrides_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);

            long_index_t invariant_total_length;
            long_index_t reduce_total_length;
...
@@ -278,12 +251,17 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
            gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize * blkGroupSize_;

-           reduceLengths_.resize(NumReduceDim);
-           for(int i = 0; i < NumReduceDim; ++i)
-           {
-               reduceLengths_[i] = lengths[reduceDims[i]];
-           }
+           x_grid_desc_m_k_ =
+               MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_);
+           gamma_grid_desc_m_k_ =
+               MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_);
+           beta_grid_desc_m_k_ =
+               MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_);
+           y_grid_desc_m_k_ =
+               MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_);
+           isSweeponce_ =
+               x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
        }

        AccDataType epsilon_;
...
@@ -295,7 +273,6 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
        std::vector<index_t> Lengths_;
        std::vector<index_t> xStrides_;
-       std::vector<index_t> reduceLengths_;
        std::vector<index_t> gammaStrides_;
        std::vector<index_t> betaStrides_;
        std::vector<index_t> yStrides_;
...
@@ -305,46 +282,35 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
        int blkGroupSize_;
        int numBlockTileIteration_;
        size_t gridSize_;
+       GridDesc_M_K x_grid_desc_m_k_;
+       GridDesc_M_K gamma_grid_desc_m_k_;
+       GridDesc_M_K beta_grid_desc_m_k_;
+       GridDesc_M_K y_grid_desc_m_k_;
+       bool isSweeponce_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-           const auto x_grid_desc_m_k = MakeSrc2dDescriptor(
-               arg.Lengths_, arg.xStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
-           const auto gamma_grid_desc_k = MakeAffine1dDescriptor(
-               arg.reduceLengths_, arg.gammaStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
-           const auto beta_grid_desc_k = MakeAffine1dDescriptor(
-               arg.reduceLengths_, arg.betaStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
-           const auto y_grid_desc_m_k = MakeSrc2dDescriptor(
-               arg.Lengths_, arg.yStrides_, arg.blkGroupSize_, arg.numBlockTileIteration_);
-
-           bool sweep_once =
-               x_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
-
-           const auto kernel_main = sweep_once
+           const auto kernel_main = arg.isSweeponce_
                ? kernel_layernorm<GridwiseReduceLayernormSweepOnce, XDataType, GammaDataType,
                                   BetaDataType, YDataType, AccDataType, AccElementwiseOperation,
-                                  GridDesc_M_K, GridDesc_K>
+                                  GridDesc_M_K>
                : kernel_layernorm<GridwiseReduceLayernormGeneric, XDataType, GammaDataType,
                                   BetaDataType, YDataType, AccDataType, AccElementwiseOperation,
-                                  GridDesc_M_K, GridDesc_K>;
+                                  GridDesc_M_K>;

            float avg_time = 0;

            avg_time += launch_and_time_kernel(stream_config,
...
@@ -352,10 +318,10 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                                               dim3(arg.gridSize_),
                                               dim3(BlockSize),
                                               0,
-                                              x_grid_desc_m_k,
-                                              gamma_grid_desc_k,
-                                              beta_grid_desc_k,
-                                              y_grid_desc_m_k,
+                                              arg.x_grid_desc_m_k_,
+                                              arg.gamma_grid_desc_m_k_,
+                                              arg.beta_grid_desc_m_k_,
+                                              arg.y_grid_desc_m_k_,
                                               arg.numBlockTileIteration_,
                                               arg.epsilon_,
                                               arg.p_x_,
...
@@ -409,26 +375,41 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
                return false;
            }

-           if(p_arg_->gammaStrides_.size() != NumReduceDim ||
-              p_arg_->betaStrides_.size() != NumReduceDim)
-               return false;
-
-           auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
-               bool ret = true;
-
-               if(!isLastDimensionCoalesced)
-                   ret = scalarPerVector == 1;
-               else
-                   ret = KThreadSliceSize % scalarPerVector == 0;
-
-               return ret;
-           };
-
-           if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize))
-               return false;
-
-           if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize))
-               return false;
+           // if fastest dim is not reduced
+           if constexpr(GammaSrcVectorDim == 0)
+           {
+               if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1)
+                   return (false);
+
+               if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
+                   return (false);
+           }
+           else // if fastest dim is reduced
+           {
+               if(p_arg_->gammaStrides_[Rank - 1] != 1)
+                   return (false);
+
+               if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
+                   return (false);
+           }
+
+           // if fastest dim is not reduced
+           if constexpr(BetaSrcVectorDim == 0)
+           {
+               if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1)
+                   return (false);
+
+               if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0)
+                   return (false);
+           }
+           else // if fastest dim is reduced
+           {
+               if(p_arg_->betaStrides_[Rank - 1] != 1)
+                   return (false);
+
+               if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0)
+                   return (false);
+           }

            return true;
        };
...
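The new `isSweeponce_` flag captures the dispatch rule the invoker now applies: the sweep-once layernorm kernel is picked only when a whole reduction row fits into a single K block tile, so each thread makes exactly one pass over its slice. A minimal standalone sketch of that rule follows; the tile constants are assumptions chosen for illustration.

// Editorial sketch of the sweep-once dispatch rule, with assumed tile constants.
constexpr int KThreadClusterSize = 64;
constexpr int KThreadSliceSize   = 8;

// True when one pass over the K slice covers the whole reduction length,
// i.e. the row fits in a single K block tile of 64 * 8 = 512 elements.
inline bool UseSweepOnceKernel(int reduce_length_k)
{
    return reduce_length_k <= KThreadClusterSize * KThreadSliceSize;
}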
include/ck/tensor_operation/gpu/device/device_permute.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <cmath>
#include <memory>
#include <type_traits>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t NumDim,
          typename InDataType,
          typename OutDataType,
          typename ElementwiseOperation>
struct DevicePermute : BaseOperator
{
    using Lengths = std::array<index_t, NumDim>;
    using Strides = Lengths;

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const Lengths& in_lengths,
                        const Strides& in_strides,
                        const Lengths& out_lengths,
                        const Strides& out_strides,
                        const void* in_dev_buffer,
                        void* out_dev_buffer,
                        ElementwiseOperation elementwise_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
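For context, this is how a caller might drive the interface above; it is an editorial sketch, not part of the commit, and `DevicePermuteInstance` (a concrete implementation such as the DevicePermuteImpl added in this commit) plus the device buffers `p_in_dev` and `p_out_dev` are assumed to exist.

// Editorial sketch: transpose a contiguous 3-D tensor from (N, H, W) to (N, W, H).
using Lengths = std::array<ck::index_t, 3>;
using Strides = Lengths;

DevicePermuteInstance permute_op;

const Lengths in_lengths{8, 32, 64};
const Strides in_strides{32 * 64, 64, 1};   // contiguous (N, H, W)
const Lengths out_lengths{8, 64, 32};
const Strides out_strides{64 * 32, 32, 1};  // contiguous (N, W, H)

auto argument = permute_op.MakeArgumentPointer(in_lengths, in_strides,
                                               out_lengths, out_strides,
                                               p_in_dev, p_out_dev,
                                               ck::tensor_operation::element_wise::PassThrough{});
if(permute_op.IsSupportedArgument(argument.get()))
{
    permute_op.MakeInvokerPointer()->Run(argument.get(), StreamConfig{});
}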
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

namespace {

template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
    ComputePtrOffsetOfStridedBatch() = default;

    ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
                                   index_t BatchStrideB,
                                   Array<ck::index_t, NumDTensor> BatchStrideDs,
                                   index_t BatchStrideE)
        : BatchStrideA_(BatchStrideA),
          BatchStrideB_(BatchStrideB),
          BatchStrideDs_(BatchStrideDs),
          BatchStrideE_(BatchStrideE)
    {
    }

    __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideA_);
    }

    __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideB_);
    }

    __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
    {
        Array<long_index_t, NumDTensor> ds_offset;

        static_for<0, NumDTensor, 1>{}(
            [&](auto i) { ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]); });

        return ds_offset;
    }

    __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideE_);
    }

    index_t BatchStrideA_;
    index_t BatchStrideB_;
    Array<ck::index_t, NumDTensor> BatchStrideDs_;
    index_t BatchStrideE_;
};
/*
* \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
*
* \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
 * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
 * strided batches, but this can easily be extended to other layouts. The returned offset can be either \p
* index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
* limitations.
*
* \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
* returns the 2D index of the tile that it computes. \see
* GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
*
* \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
* tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
* descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
* device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
* DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
* pointer offset into \p ComputePtrOffsetOfStridedBatch.
*
* \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
* Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
* realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
*
*/
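// A rough sketch of how the kernel below consumes ComputePtrOffsetOfBatch (illustrative only;
// the authoritative code is the kernel body that follows): every workgroup derives its batch
// index from its block id and shifts the A/B/Ds/E base pointers before running the GEMM, e.g.
//
//   const index_t g_idx    = get_block_1d_id() / num_blocks_per_batch;
//   const auto    a_offset = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
//   GridwiseGemm::Run(p_a_grid + a_offset, /* ...same idea for B, Ds and E... */);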
template <typename GridwiseGemm,
          typename ABDataType,
          typename DsPointer,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
          typename Block2ETileMap,
          typename ComputePtrOffsetOfBatch,
          bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
        kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle(
            const ABDataType* __restrict__ p_a_grid,
            const ABDataType* __restrict__ p_b_grid,
            DsPointer p_ds_grid,
            EDataType* __restrict__ p_e_grid,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const CDEElementwiseOperation cde_element_op,
            const index_t batch_count,
            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
                ds_grid_desc_mblock_mperblock_nblock_nperblock,
            const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                e_grid_desc_mblock_mperblock_nblock_nperblock_,
            const Block2ETileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);

    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));

    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);

    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    DsPointer p_ds_grid_grp;

    static constexpr index_t NumDTensor =
        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();

    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
                                                  p_b_grid + b_batch_offset,
                                                  p_ds_grid_grp,
                                                  p_e_grid + e_batch_offset,
                                                  p_shared,
                                                  a_element_op,
                                                  b_element_op,
                                                  cde_element_op,
                                                  a_grid_desc_ak0_m_ak1,
                                                  b_grid_desc_bk0_n_bk1,
                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                                  e_grid_desc_mblock_mperblock_nblock_nperblock_,
                                                  block_2_ctile_map);
#else
    ignore = p_a_grid;
    ignore = p_b_grid;
    ignore = p_ds_grid;
    ignore = p_e_grid;
    ignore = batch_count;
    ignore = a_grid_desc_ak0_m_ak1;
    ignore = b_grid_desc_bk0_n_bk1;
    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_;
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = cde_element_op;
    ignore = compute_ptr_offset_of_batch;
    ignore = block_2_ctile_map;
#endif
}

} // namespace
// Conv backward data multiple D:
// input : output image A: [G, N, K, Ho, Wo]
// input : weight B: [G, K, C, Y, X],
// input : D0, D1, ... : [G, N, C, Hi, Wi]
// output : input image E: [G, N, C, Hi, Wi]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
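// Illustrative only: for the common bias-add fusion (E = C + D0), a CDEElementwiseOp would look
// roughly like the functor sketched below; the operator types actually used by the library live
// in ck/tensor_operation/gpu/element/element_wise_operation.hpp.
//
//   struct AddBias
//   {
//       template <typename E, typename C, typename D>
//       __host__ __device__ void operator()(E& e, const C& c, const D& d) const
//       {
//           e = ck::type_convert<E>(c + ck::type_convert<C>(d));
//       }
//   };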
template <index_t NDimSpatial,
          typename ALayout,          // output image
          typename BLayout,          // weight
          typename DsLayout,         // bias
          typename ELayout,          // input image
          typename ADataType,        // output image
          typename BDataType,        // weight
          typename AccDataType,
          typename CShuffleDataType,
          typename DsDataType,       // bias
          typename EDataType,        // input image
          typename AElementwiseOp,   // output image
          typename BElementwiseOp,   // weight
          typename CDEElementwiseOp, // C, bias, and input image
          ConvolutionBackwardDataSpecialization ConvBackwardDataSpecialization,
          bool DoPadGemmM,
          bool DoPadGemmN,
          index_t NumGemmKPrefetchStage,
          index_t BlockSize,
          index_t MPerBlock,
          index_t NPerBlock,
          index_t KPerBlock,
          index_t AK1,
          index_t BK1,
          index_t MPerXDL,
          index_t NPerXDL,
          index_t MXdlPerWave,
          index_t NXdlPerWave,
          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
          typename ABlockTransferThreadClusterArrangeOrder,
          typename ABlockTransferSrcAccessOrder,
          index_t ABlockTransferSrcVectorDim,
          index_t ABlockTransferSrcScalarPerVector,
          index_t ABlockTransferDstScalarPerVector_AK1,
          index_t ABlockLdsExtraM,
          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
          typename BBlockTransferThreadClusterArrangeOrder,
          typename BBlockTransferSrcAccessOrder,
          index_t BBlockTransferSrcVectorDim,
          index_t BBlockTransferSrcScalarPerVector,
          index_t BBlockTransferDstScalarPerVector_BK1,
          index_t BBlockLdsExtraN,
          index_t CShuffleMXdlPerWavePerShuffle,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CDEBlockTransferScalarPerVector_NPerBlock,
          LoopScheduler LoopSched = make_default_loop_scheduler()>
struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
    : public DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
                                               ALayout,    // output image
                                               BLayout,    // weight
                                               DsLayout,   // bias
                                               ELayout,    // input image
                                               ADataType,  // output image
                                               BDataType,  // weight
                                               DsDataType, // bias
                                               EDataType,  // input image
                                               AElementwiseOp,
                                               BElementwiseOp,
                                               CDEElementwiseOp>
{
    // FIXME
    static_assert(NDimSpatial == 2, "wrong! only implemented for 2D now");

    using DeviceOp = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1;

    static constexpr index_t NumDTensor = DsDataType::Size();

    // TODO make A/B datatype different
    using ABDataType = ADataType;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

    static constexpr auto transform_conv_to_gemm =
        TransformConvBwdDataToGemm_v1<NDimSpatial,
                                      ConvBackwardDataSpecialization,
                                      AK1,
                                      BK1,
                                      MPerBlock,
                                      NPerBlock,
                                      DoPadGemmM,
                                      DoPadGemmN>{};
    static auto GetDummyABDsEGridDescriptor()
    {
        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};

        const auto a_grid_desc_ak0_m_ak1 =
            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths);

        const auto b_grid_desc_bk0_n_bk1 =
            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_tensor_lengths, dummy_tensor_strides,
                dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths);

        const auto ds_grid_desc_m_n = generate_tuple(
            [&](auto i) {
                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
                    dummy_tensor_lengths, dummy_tensor_strides,
                    dummy_tensor_lengths, dummy_tensor_strides,
                    dummy_tensor_lengths, dummy_tensor_strides,
                    dummy_spatial_lengths, dummy_spatial_lengths,
                    dummy_spatial_lengths, dummy_spatial_lengths,
                    dummy_spatial_lengths);
            },
            Number<NumDTensor>{});

        const auto e_grid_desc_m_n = transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
            dummy_tensor_lengths, dummy_tensor_strides,
            dummy_tensor_lengths, dummy_tensor_strides,
            dummy_tensor_lengths, dummy_tensor_strides,
            dummy_spatial_lengths, dummy_spatial_lengths,
            dummy_spatial_lengths, dummy_spatial_lengths,
            dummy_spatial_lengths);

        return make_tuple(
            a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
    }
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
        ABDataType, // TODO: distinguish A/B datatype
        AccDataType,
        CShuffleDataType,
        DsDataType,
        EDataType,
        AElementwiseOp,
        BElementwiseOp,
        CDEElementwiseOp,
        InMemoryDataOperationEnum::Set,
        NumGemmKPrefetchStage,
        BlockSize,
        MPerBlock,
        NPerBlock,
        KPerBlock,
        AK1,
        BK1,
        MPerXDL,
        NPerXDL,
        MXdlPerWave,
        NXdlPerWave,
        ABlockTransferThreadClusterLengths_AK0_M_AK1,
        ABlockTransferThreadClusterArrangeOrder,
        ABlockTransferSrcAccessOrder,
        ABlockTransferSrcVectorDim,
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_AK1,
        false,
        ABlockLdsExtraM,
        BBlockTransferThreadClusterLengths_BK0_N_BK1,
        BBlockTransferThreadClusterArrangeOrder,
        BBlockTransferSrcAccessOrder,
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_BK1,
        false,
        BBlockLdsExtraN,
        CShuffleMXdlPerWavePerShuffle,
        CShuffleNXdlPerWavePerShuffle,
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CDEBlockTransferScalarPerVector_NPerBlock,
        LoopSched>;
    template <typename Desc_K0_M_K1>
    static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& desc_k0_m_k1)
    {
        const auto grid_desc_m_k = transform_tensor_descriptor(
            desc_k0_m_k1,
            make_tuple(make_pass_through_transform(desc_k0_m_k1.GetLength(I1)),
                       make_merge_transform(
                           make_tuple(desc_k0_m_k1.GetLength(I0), desc_k0_m_k1.GetLength(I2)))),
            make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return grid_desc_m_k;
    }
    // desc
    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());

    using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
    using DsGridDesc_M_N      = remove_cvref_t<tuple_element_t<2, ABDsEGridDesc>>;
    using EGridDesc_M_N       = remove_cvref_t<tuple_element_t<3, ABDsEGridDesc>>;

    using AGridDesc_M_K = decltype(transform_k0_m_k1_to_m_k(AGridDesc_AK0_M_AK1{}));
    using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{}));

    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}));
    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));

    // block-to-e-tile map
    using Block2ETileMap =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
    // Argument
    struct Argument : public BaseArgument
    {
        Argument(const void* p_a,                                                 // output image
                 const void* p_b,                                                 // weight
                 const std::array<const void*, NumDTensor>& p_ds,                 // bias
                 void* p_e,                                                       // input image
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths,
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_lengths,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_strides,
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads,
                 const AElementwiseOp& a_element_op,
                 const BElementwiseOp& b_element_op,
                 const CDEElementwiseOp& cde_element_op)
            : p_a_grid_{static_cast<const ADataType*>(p_a)},
              p_b_grid_{static_cast<const BDataType*>(p_b)},
              p_ds_grid_{},
              p_e_grid_{static_cast<EDataType*>(p_e)},
              num_group_{a_g_n_k_wos_lengths[0]},
              num_gemm_{},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op},
              a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
              conv_filter_strides_{conv_filter_strides},
              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
            // populate Ds pointer
            static_for<0, NumDTensor, 1>{}([&](auto i) {
                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;

                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
            });

            // A/B/Ds/E Batch Stride
            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];

            static_for<0, NumDTensor, 1>{}([&](auto i) {
                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
            });

            // problem definition
            const index_t Y = b_g_k_c_xs_lengths[3];
            const index_t X = b_g_k_c_xs_lengths[4];

            const index_t ConvStrideH = conv_filter_strides_[0];
            const index_t ConvStrideW = conv_filter_strides_[1];

            const index_t ConvDilationH = conv_filter_dilations_[0];
            const index_t ConvDilationW = conv_filter_dilations_[1];

            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

            const auto YTilde = ConvStrideH / GcdStrideDilationH;
            const auto XTilde = ConvStrideW / GcdStrideDilationW;

            // number of GEMM
            num_gemm_ = YTilde * XTilde;

            for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
            {
                for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
                {
                    // check slice is valid
                    const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
                    const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

                    if(YDotSlice * XDotSlice <= 0)
                    {
                        continue;
                    }

                    const auto a_grid_desc_ak0_m_ak1 =
                        transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                            b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                            e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations,
                            input_left_pads, input_right_pads,
                            {i_ytilde, i_xtilde});

                    const auto b_grid_desc_bk0_n_bk1 =
                        transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                            b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                            e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations,
                            input_left_pads, input_right_pads,
                            {i_ytilde, i_xtilde});

                    DsGridDesc_M_N ds_grid_desc_m_n;

                    // populate Ds desc
                    static_for<0, NumDTensor, 1>{}([&](auto i) {
                        using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

                        ds_grid_desc_m_n(i) =
                            transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
                                a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                                b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                                ds_g_n_c_wis_lengths[i], ds_g_n_c_wis_strides[i],
                                conv_filter_strides, conv_filter_dilations,
                                input_left_pads, input_right_pads,
                                {i_ytilde, i_xtilde});
                    });

                    const auto e_grid_desc_m_n =
                        transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                            b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                            e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations,
                            input_left_pads, input_right_pads,
                            {i_ytilde, i_xtilde});

                    // desc for problem definition
                    const auto a_grid_desc_m_k = transform_k0_m_k1_to_m_k(a_grid_desc_ak0_m_ak1);
                    const auto b_grid_desc_n_k = transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1);

                    a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k);
                    b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k);
                    ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
                    e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n);

                    // desc for blockwise copy
                    a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1);
                    b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1);

                    // block-to-e-tile-map
                    auto block_2_etile_map =
                        GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);

                    block_2_etile_map_container_.push_back(block_2_etile_map);

                    if(GridwiseGemm::CheckValidity(a_grid_desc_m_k,
                                                   b_grid_desc_n_k,
                                                   ds_grid_desc_m_n,
                                                   e_grid_desc_m_n,
                                                   block_2_etile_map))
                    {
                        ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
                            GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                ds_grid_desc_m_n));

                        e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
                            GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                e_grid_desc_m_n));
                    }
                }
            }
        }

        void Print() const
        {
            for(index_t i = 0; i < num_gemm_; i++)
            {
                std::cout << "a_grid_desc_ak0_m_ak1_container_"
                          << a_grid_desc_ak0_m_ak1_container_[i] << std::endl;

                std::cout << "b_grid_desc_bk0_n_bk1_container_"
                          << b_grid_desc_bk0_n_bk1_container_[i] << std::endl;

                static_for<0, NumDTensor, 1>{}([&](auto j) {
                    std::cout << "ds_grid_desc_mblock_mperblock_nblock_nperblock_container_"
                              << ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i][j]
                              << std::endl;
                });

                std::cout << "e_grid_desc_mblock_mperblock_nblock_nperblock_container_"
                          << e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i]
                          << std::endl;
            }
        }

        // pointers
        const ADataType* p_a_grid_;
        const BDataType* p_b_grid_;
        typename GridwiseGemm::DsGridPointer p_ds_grid_;
        EDataType* p_e_grid_;

        // tensor descriptor for problem definition
        index_t num_group_;
        index_t num_gemm_;
        std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
        std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
        std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
        std::vector<EGridDesc_M_N> e_grid_desc_m_n_container_;

        // tensor descriptor for block-wise copy
        std::vector<AGridDesc_AK0_M_AK1> a_grid_desc_ak0_m_ak1_container_;
        std::vector<BGridDesc_BK0_N_BK1> b_grid_desc_bk0_n_bk1_container_;
        std::vector<DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
            ds_grid_desc_mblock_mperblock_nblock_nperblock_container_;
        std::vector<EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
            e_grid_desc_mblock_mperblock_nblock_nperblock_container_;

        // block-to-e-tile map
        std::vector<Block2ETileMap> block_2_etile_map_container_;

        // for computing batch offset
        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;

        // element-wise op
        AElementwiseOp a_element_op_;
        BElementwiseOp b_element_op_;
        CDEElementwiseOp cde_element_op_;

        // for checking IsSupportedArgument()
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial> conv_filter_strides_;
        std::array<index_t, NDimSpatial> conv_filter_dilations_;
        std::array<index_t, NDimSpatial> input_left_pads_;
        std::array<index_t, NDimSpatial> input_right_pads_;
    };
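    // Worked example for the GEMM split performed in the constructor above (example numbers):
    // a 3x3 filter with conv_filter_strides = {2, 2} and conv_filter_dilations = {1, 1} gives
    // GcdStrideDilationH = GcdStrideDilationW = 1, so YTilde = XTilde = 2 and
    // num_gemm_ = YTilde * XTilde = 4; every (i_ytilde, i_xtilde) pair whose YDotSlice * XDotSlice
    // is non-empty contributes one GEMM's worth of descriptors to the containers.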
    // Invoker
    struct Invoker : public BaseInvoker
    {
        using Argument = DeviceOp::Argument;

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            if(stream_config.log_level_ > 0)
            {
                arg.Print();
            }

            float ave_time = 0;

            for(index_t i = 0; i < arg.num_gemm_; i++)
            {
                if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                                arg.b_grid_desc_n_k_container_[i],
                                                arg.ds_grid_desc_m_n_container_[i],
                                                arg.e_grid_desc_m_n_container_[i],
                                                arg.block_2_etile_map_container_[i]))
                {
                    throw std::runtime_error("wrong! device_op has invalid setting");
                }

                const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize(
                                              arg.e_grid_desc_m_n_container_[i]) *
                                          arg.num_group_;

                const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1);

                auto launch_kernel = [&](auto has_main_k_block_loop) {
                    constexpr bool has_main_loop = has_main_k_block_loop.value;

                    const auto kernel = kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle<
                        GridwiseGemm,
                        ADataType, // TODO: distinguish A/B datatype
                        typename GridwiseGemm::DsGridPointer,
                        EDataType,
                        AElementwiseOp,
                        BElementwiseOp,
                        CDEElementwiseOp,
                        DeviceOp::AGridDesc_AK0_M_AK1,
                        DeviceOp::BGridDesc_BK0_N_BK1,
                        DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        Block2ETileMap,
                        ComputePtrOffsetOfStridedBatch<NumDTensor>,
                        has_main_loop>;

                    return launch_and_time_kernel(
                        stream_config,
                        kernel,
                        dim3(grid_size),
                        dim3(BlockSize),
                        0,
                        arg.p_a_grid_,
                        arg.p_b_grid_,
                        arg.p_ds_grid_,
                        arg.p_e_grid_,
                        arg.a_element_op_,
                        arg.b_element_op_,
                        arg.cde_element_op_,
                        arg.a_g_n_k_wos_lengths_[0], // Group count
                        arg.a_grid_desc_ak0_m_ak1_container_[i],
                        arg.b_grid_desc_bk0_n_bk1_container_[i],
                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.block_2_etile_map_container_[i],
                        arg.compute_ptr_offset_of_batch_);
                };

                if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))
                {
                    ave_time += launch_kernel(integral_constant<bool, true>{});
                }
                else
                {
                    ave_time += launch_kernel(integral_constant<bool, false>{});
                }
            }

            return ave_time;
        }

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };
    static bool IsSupportedArgument(const Argument& arg)
    {
        const index_t ConvK = arg.b_g_k_c_xs_lengths_[1];
        const index_t ConvC = arg.b_g_k_c_xs_lengths_[2];

        // Specialization
        if constexpr(ConvBackwardDataSpecialization ==
                     ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
        {
            // check if it's 1x1, stride=1, pad=0 conv
            for(int i = 0; i < NDimSpatial; i++)
            {
                if(!(arg.b_g_k_c_xs_lengths_[3 + i] == 1 && arg.conv_filter_strides_[i] == 1 &&
                     arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0))
                {
                    return false;
                }
            }
        }

        // vector load for A matrix from global memory to LDS
        if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
        {
            if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // vector load for B matrix from global memory to LDS
        if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
        {
            if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // vector store for Ds
        bool ds_valid = true;

        static_for<0, NumDTensor, 1>{}([&](auto i) {
            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

            if constexpr(is_same_v<DLayout, tensor_layout::convolution::GNHWC> ||
                         is_same_v<DLayout, tensor_layout::convolution::NHWGC> ||
                         is_same_v<DLayout, tensor_layout::convolution::G_NHW_C> ||
                         is_same_v<DLayout, tensor_layout::convolution::GC> ||
                         is_same_v<DLayout, tensor_layout::convolution::G_C>)
            {
                // vector load D matrix from global memory
                if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
                {
                    ds_valid = false;
                }
            }
            else
            {
                ds_valid = false;
            }
        });

        if(!ds_valid)
        {
            return false;
        }

        // vector store for E
        if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC>)
        {
            // vector store C matrix into global memory
            if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // Gridwise GEMM size
        for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
        {
            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                            arg.b_grid_desc_n_k_container_[i],
                                            arg.ds_grid_desc_m_n_container_[i],
                                            arg.e_grid_desc_m_n_container_[i],
                                            arg.block_2_etile_map_container_[i]))
            {
                return false;
            }
        }

        return true;
    }

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }
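    // Example of the constraints above (illustrative numbers): with ALayout = GNHWK, ConvK = 128
    // and ABlockTransferSrcScalarPerVector = 8, the A check passes (128 % 8 == 0); with ConvC = 96
    // and CDEBlockTransferScalarPerVector_NPerBlock = 8, the Ds/E checks pass (96 % 8 == 0). A
    // channel count that is not a multiple of the configured vector width makes this instance
    // report the argument as unsupported.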
    static auto MakeArgument(
        const void* p_a,                                                 // output image
        const void* p_b,                                                 // weight
        const std::array<const void*, NumDTensor>& p_ds,                 // bias
        void* p_e,                                                       // input image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_c_wis_lengths, // bias
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_c_wis_strides,                                        // bias
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
        const std::array<index_t, NDimSpatial>& conv_filter_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
        const std::array<index_t, NDimSpatial>& input_left_pads,
        const std::array<index_t, NDimSpatial>& input_right_pads,
        const AElementwiseOp& a_element_op,
        const BElementwiseOp& b_element_op,
        const CDEElementwiseOp& cde_element_op)
    {
        return Argument{p_a, p_b, p_ds, p_e,
                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                        ds_g_n_c_wis_lengths, ds_g_n_c_wis_strides,
                        e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                        conv_filter_strides, conv_filter_dilations,
                        input_left_pads, input_right_pads,
                        a_element_op, b_element_op, cde_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }
    std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const void* p_a,                                                 // output image
        const void* p_b,                                                 // weight
        const std::array<const void*, NumDTensor>& p_ds,                 // bias
        void* p_e,                                                       // input image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_c_wis_lengths, // bias
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
            ds_g_n_c_wis_strides,                                        // bias
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
        const std::array<index_t, NDimSpatial>& conv_filter_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
        const std::array<index_t, NDimSpatial>& input_left_pads,
        const std::array<index_t, NDimSpatial>& input_right_pads,
        const AElementwiseOp& a_element_op,
        const BElementwiseOp& b_element_op,
        const CDEElementwiseOp& cde_element_op) override
    {
        return std::make_unique<Argument>(p_a, p_b, p_ds, p_e,
                                          a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                                          b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                                          ds_g_n_c_wis_lengths, ds_g_n_c_wis_strides,
                                          e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                                          conv_filter_strides, conv_filter_dilations,
                                          input_left_pads, input_right_pads,
                                          a_element_op, b_element_op, cde_element_op);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
            << KPerBlock << ", "
            << AK1 << ", "
            << BK1 << ", "
            << getConvBackwardDataSpecializationString(ConvBackwardDataSpecialization)
            << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
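// A minimal host-side sketch of how an instance of this device op is typically driven (the
// instance alias, element-wise op names, and tensor/pointer names below are placeholders, not
// part of the header above):
//
//   using DeviceOpInstance =
//       DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1</* template parameters as above */>;
//
//   auto op       = DeviceOpInstance{};
//   auto argument = op.MakeArgument(p_out_grad, p_weight, {p_bias}, p_in_grad,
//                                   a_lengths, a_strides, b_lengths, b_strides,
//                                   ds_lengths, ds_strides, e_lengths, e_strides,
//                                   filter_strides, filter_dilations, left_pads, right_pads,
//                                   a_op, b_op, cde_op);
//   if(op.IsSupportedArgument(argument))
//   {
//       op.MakeInvoker().Run(argument, StreamConfig{});
//   }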
include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
0 → 100644
View file @
7e493730
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>
#include <utility>

#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_permute.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

// Swap last 2 dimensions
// input shape:  [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], d[NumDim-1]]
//                                                    ^^^^^^^^^^^^^^^^^^^^^^^^
// output shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-1], d[NumDim-2]]
//                                                    ^^^^^^^^^^^^^^^^^^^^^^^^
template <index_t NumDim,
          typename InDataType,
          typename OutDataType,
          typename ElementwiseOperation,
          index_t BlockSize,
          index_t NPerBlock,
          index_t HPerBlock,
          index_t WPerBlock,
          index_t InBlockLdsExtraW,
          typename InBlockTransferThreadClusterLengths,
          typename InBlockTransferThreadClusterArrangeOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector>
struct DevicePermuteImpl : DevicePermute<NumDim, InDataType, OutDataType, ElementwiseOperation>
{
    using BaseType = DevicePermute<NumDim, InDataType, OutDataType, ElementwiseOperation>;
    using typename BaseType::Lengths;
    using typename BaseType::Strides;

    static_assert(3 <= NumDim, "Only accept at least 3D dimension tensor");
    static_assert((NumDim - 2) <= SrcVectorDim && SrcVectorDim < NumDim);
    static_assert((NumDim - 2) <= DstVectorDim && DstVectorDim < NumDim);
    static_assert(SrcVectorDim != DstVectorDim);

    template <index_t N = NumDim>
    static auto ConvertArrayToTuple(const std::array<index_t, NumDim>& array)
    {
        static_assert(1 <= N && N <= NumDim);

        return generate_tuple([&](auto I) { return array[I]; }, Number<N>{});
    }

    static auto MakeDescriptor_N_H_W(const Lengths& lengths, const Strides& stride)
    {
        // create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2],
        // d[NumDim-1]]
        const auto desc =
            make_naive_tensor_descriptor(ConvertArrayToTuple(lengths), ConvertArrayToTuple(stride));

        // merge nd to 3d descriptor, shape: [(d[0] * d[1] * d[2] * ... * d[NumDim-3]), d[NumDim-2],
        // d[NumDim-1]]
        // => [N, H, W]
        const index_t H = *std::next(rbegin(lengths));
        const index_t W = *rbegin(lengths);
        const auto desc_n_h_w = transform_tensor_descriptor(
            desc,
            make_tuple(make_merge_transform(ConvertArrayToTuple<NumDim - 2>(lengths)),
                       make_pass_through_transform(H),
                       make_pass_through_transform(W)),
            make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim - 2>{}),
                       Sequence<NumDim - 2>{},
                       Sequence<NumDim - 1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

        return PadTensorDescriptor(
            desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
    }
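    // Example of the merge above (illustrative): for NumDim = 4 and lengths = {2, 3, 8, 16}, the
    // naive 4-d descriptor is collapsed to [N, H, W] = [2 * 3, 8, 16] = [6, 8, 16]; each of N, H
    // and W is then padded up to a multiple of NPerBlock, HPerBlock and WPerBlock respectively so
    // the block tiling covers the tensor exactly.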
    using InGridDesc  = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
    using OutGridDesc = InGridDesc;

    using GridwisePermute = GridwisePermute<
        InGridDesc,
        OutGridDesc,
        InDataType,
        OutDataType,
        ElementwiseOperation,
        BlockSize,
        NPerBlock,
        HPerBlock,
        WPerBlock,
        InBlockLdsExtraW,
        InBlockTransferThreadClusterLengths,
        InBlockTransferThreadClusterArrangeOrder,
        SrcVectorDim - (NumDim - 3), // calculate new SrcVectorDim for the merged descriptor
        DstVectorDim - (NumDim - 3), // calculate new DstVectorDim for the merged descriptor
        SrcScalarPerVector,
        DstScalarPerVector>;

    using Block2TileMap = typename GridwisePermute::DefaultBlock2TileMap;

    struct Argument : public BaseArgument
    {
        Argument(const Lengths& in_lengths,
                 const Strides& in_strides,
                 const Lengths& out_lengths,
                 const Strides& out_strides,
                 const void* in_dev_buffer,
                 void* out_dev_buffer,
                 ElementwiseOperation elementwise_op)
            : in_dev_buffer_(static_cast<const InDataType*>(in_dev_buffer)),
              out_dev_buffer_(static_cast<OutDataType*>(out_dev_buffer)),
              in_grid_desc_(MakeDescriptor_N_H_W(in_lengths, in_strides)),
              out_grid_desc_(MakeDescriptor_N_H_W(out_lengths, out_strides)),
              in_lengths_(in_lengths),
              in_strides_(in_strides),
              out_lengths_(out_lengths),
              out_strides_(out_strides),
              elementwise_op_(elementwise_op),
              block_2_tile_map_(GridwisePermute::MakeDefaultBlock2TileMap(in_grid_desc_))
        {
        }

        const InDataType* in_dev_buffer_;
        OutDataType* out_dev_buffer_;

        InGridDesc in_grid_desc_;
        OutGridDesc out_grid_desc_;

        Lengths in_lengths_;
        Strides in_strides_;
        Lengths out_lengths_;
        Strides out_strides_;

        ElementwiseOperation elementwise_op_;

        Block2TileMap block_2_tile_map_;
    };

    struct Invoker : BaseInvoker
    {
        static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_desc_);

            const auto kernel = kernel_nd_permute<GridwisePermute,
                                                  InGridDesc,
                                                  OutGridDesc,
                                                  InDataType,
                                                  OutDataType,
                                                  ElementwiseOperation,
                                                  Block2TileMap>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(grid_size),
                                                        dim3(BlockSize),
                                                        0,
                                                        arg.in_grid_desc_,
                                                        arg.out_grid_desc_,
                                                        arg.in_dev_buffer_,
                                                        arg.out_dev_buffer_,
                                                        arg.elementwise_op_,
                                                        arg.block_2_tile_map_);
            return elapsed_time;
        }

        float Run(const BaseArgument* arg,
                  const StreamConfig& stream_config = StreamConfig{}) override final
        {
            const auto* const argument = dynamic_cast<const Argument*>(arg);
            if(!argument)
            {
                return NAN;
            }

            return Run(*argument, stream_config);
        }
    };

    static bool IsSupportedArgument(const Argument& arg)
    {
        constexpr auto GetPaddedLength = [](index_t length, index_t tile_length) {
            return math::integer_divide_ceil(length, tile_length) * tile_length;
        };

        constexpr auto IsScalarPerVectorValid =
            [](index_t length, index_t stride, index_t scalar_per_vector) {
                if(stride == 1 && length % scalar_per_vector == 0)
                {
                    return true;
                }
                else if(stride != 1 && scalar_per_vector == 1)
                {
                    return true;
                }

                return false;
            };

        return IsScalarPerVectorValid(arg.in_lengths_[SrcVectorDim],
                                      arg.in_strides_[SrcVectorDim],
                                      SrcScalarPerVector) &&
               IsScalarPerVectorValid(
                   GetPaddedLength(arg.in_lengths_[SrcVectorDim],
                                   (SrcVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)),
                   arg.in_strides_[SrcVectorDim],
                   SrcScalarPerVector) &&
               IsScalarPerVectorValid(arg.out_lengths_[DstVectorDim],
                                      arg.out_strides_[DstVectorDim],
                                      DstScalarPerVector) &&
               IsScalarPerVectorValid(
                   GetPaddedLength(arg.out_lengths_[DstVectorDim],
                                   (DstVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)),
                   arg.in_strides_[DstVectorDim],
                   DstScalarPerVector) &&
               GridwisePermute::CheckValidity(arg.in_grid_desc_, arg.out_grid_desc_);
    }

    // override methods inherited from 'BaseOperator'
    bool IsSupportedArgument(const BaseArgument* arg) override final
    {
        const auto* const argument = dynamic_cast<const Argument*>(arg);
        if(!argument)
        {
            return false;
        }

        return IsSupportedArgument(*argument);
    }

    // override methods inherited from 'DevicePermute'
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const Lengths& in_lengths,
                        const Strides& in_strides,
                        const Lengths& out_lengths,
                        const Strides& out_strides,
                        const void* in_dev_buffer,
                        void* out_dev_buffer,
                        ElementwiseOperation elementwise_op) override final
    {
        return std::make_unique<Argument>(in_lengths,
                                          in_strides,
                                          out_lengths,
                                          out_strides,
                                          in_dev_buffer,
                                          out_dev_buffer,
                                          elementwise_op);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override final
    {
        return std::make_unique<Invoker>();
    }

    // other constructor methods
    template <typename... Args>
    static std::enable_if_t<std::is_constructible_v<Argument, Args...>, Argument>
    MakeArgument(Args&&... args) noexcept(std::is_nothrow_constructible_v<Argument, Args...>)
    {
        return Argument{std::forward<Args>(args)...};
    }

    static std::enable_if_t<std::is_default_constructible_v<Invoker>, Invoker>
    MakeInvoker() noexcept(std::is_nothrow_default_constructible_v<Invoker>)
    {
        return Invoker{};
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
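// A minimal usage sketch under assumed tuning parameters (this particular instantiation is an
// example, not an instance shipped by the library): permute the last two dimensions of a 3-d
// fp16 tensor.
//
//   using F16  = ck::half_t;
//   using Pass = ck::tensor_operation::element_wise::PassThrough;
//   using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl<
//       3, F16, F16, Pass,                              // NumDim, In/OutDataType, ElementwiseOperation
//       256, 1, 32, 32, 1,                              // BlockSize, N/H/WPerBlock, InBlockLdsExtraW
//       ck::Sequence<1, 16, 16>, ck::Sequence<0, 1, 2>, // thread cluster lengths / arrange order
//       2, 1, 4, 4>;                                    // SrcVectorDim, DstVectorDim, Src/DstScalarPerVector
//
//   auto permute  = DevicePermuteInstance{};
//   auto argument = permute.MakeArgument(in_lengths, in_strides, out_lengths, out_strides,
//                                        p_in_dev, p_out_dev, Pass{});
//   if(permute.IsSupportedArgument(argument))
//       permute.MakeInvoker().Run(argument);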
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
View file @
7e493730
...
@@ -92,6 +92,12 @@ struct GNDHWC : public BaseTensorLayout
     static constexpr const char* name = "GNDHWC";
 };
 
+// for input bias
+struct GC : public BaseTensorLayout
+{
+    static constexpr const char* name = "GC";
+};
+
 // input tensor
 // packed NWGC/NHWGC/NDHWGC
 struct NWGC : public BaseTensorLayout
@@ -126,6 +132,12 @@ struct G_NDHW_C : public BaseTensorLayout
     static constexpr const char* name = "G_NDHW_C";
 };
 
+// for input bias
+struct G_C : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_C";
+};
+
 // weight tensor
 // packed KCX/KCYX/KCZYX
 struct KCX : public BaseTensorLayout
@@ -296,6 +308,12 @@ struct GNDHWK : public BaseTensorLayout
     static constexpr const char* name = "GNDHWK";
 };
 
+// for output bias
+struct GK : public BaseTensorLayout
+{
+    static constexpr const char* name = "GK";
+};
+
 // output tensor
 // packed NWGK/NHWGK/NDHWGK
 struct NWGK : public BaseTensorLayout
@@ -330,6 +348,12 @@ struct G_NDHW_K : public BaseTensorLayout
     static constexpr const char* name = "G_NDHW_K";
 };
 
+// for output bias
+struct G_K : public BaseTensorLayout
+{
+    static constexpr const char* name = "G_K";
+};
+
 // K-reduced output tensor (packed)
 struct GNW : public BaseTensorLayout
 {
...