Commit e72c0c43 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents d714fa15 313bbea5
......@@ -10,12 +10,68 @@
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_batched_gemm_xdlops_v2r3.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M_K1,
typename BGridDesc_K0_N_K1,
typename CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename ComputeBasePtrOfBatch,
typename Block2CTileMap,
bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_batched_gemm_xdlops_v2r3(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const index_t num_batches,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const ComputeBasePtrOfBatch compute_base_ptr_of_batch_,
const Block2CTileMap block_2_ctile_map)
{
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / num_batches);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetBBasePtr(g_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_c_grid + c_batch_offset,
p_shared,
a_grid_desc_k0_m_k1,
b_grid_desc_k0_n_k1,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
a_element_op,
b_element_op,
c_element_op,
block_2_ctile_map);
}
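// Illustrative walk-through of the block-to-batch mapping above, with made-up
// numbers (BatchCount = 4, and a single GEMM that needs 8 workgroups):
//   grid_size            = 8 * 4 = 32   (the Invoker below multiplies the
//                                        per-GEMM grid size by BatchCount)
//   num_blocks_per_batch = 32 / 4 = 8
//   block id 0..7 -> g_idx = 0,  block id 8..15 -> g_idx = 1,  ...
// Each workgroup then offsets p_a_grid / p_b_grid / p_c_grid by the per-batch
// base pointers from compute_base_ptr_of_batch_ and runs the plain (non-batched)
// GridwiseGemm on its own batch.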
template <typename ADataType,
typename BDataType,
typename CDataType,
......@@ -35,14 +91,14 @@ template <typename ADataType,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_G_K0_M_K1,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_G_K0_N_K1,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
......@@ -57,149 +113,215 @@ struct DeviceBatchedGemmXdl
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto K1Number = Number<K1>{};
static auto
MakeAGridDescriptor_G_K0_M_K1(index_t BatchCount, index_t M, index_t K, index_t StrideA)
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto a_grid_desc_g_m_k = [&]() {
const auto a_grid_desc_m_k = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, M, K),
make_tuple(M * StrideA, StrideA, I1));
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, M, K),
make_tuple(K * StrideA, I1, StrideA));
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, M));
}
}();
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto a_grid_desc_g_k0_mp_k1 =
transform_tensor_descriptor(a_grid_desc_g_m_k,
make_tuple(make_pass_through_transform(BatchCount),
make_unmerge_transform(make_tuple(K0, K1Number)),
const auto a_grid_desc_k0_mp_k1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(M, PadM)),
make_tuple(Sequence<0>{}, Sequence<2>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_g_k0_mp_k1;
return a_grid_desc_k0_mp_k1;
}
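// A small worked example of the padding and K-split above (hypothetical sizes):
// with MPerBlock = 128, K1 = 8, M = 1000, K = 64:
//   PadM = (128 - 1000 % 128) % 128 = 24  -> padded M = 1024 (multiple of MPerBlock)
//   K0   = 64 / 8 = 8                     -> K is viewed as (K0, K1) = (8, 8)
// so the returned descriptor has lengths (K0, M + PadM, K1) = (8, 1024, 8).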
static auto
MakeBGridDescriptor_G_K0_N_K1(index_t BatchCount, index_t K, index_t N, index_t StrideB)
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto b_grid_desc_g_k_n = [&]() {
const auto b_grid_desc_k_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, K, N),
make_tuple(K * StrideB, StrideB, I1));
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, K, N),
make_tuple(N * StrideB, I1, StrideB));
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, K));
}
}();
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
const auto b_grid_desc_g_k0_np_k1 =
transform_tensor_descriptor(b_grid_desc_g_k_n,
make_tuple(make_pass_through_transform(BatchCount),
make_unmerge_transform(make_tuple(K0, K1Number)),
const auto b_grid_desc_k0_np_k1 =
transform_tensor_descriptor(b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{}));
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_g_k0_np_k1;
return b_grid_desc_k0_np_k1;
}
static auto MakeCGridDescriptor_G_M_N(index_t BatchCount, index_t M, index_t N, index_t StrideC)
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
const auto c_grid_desc_g_m_n = [&]() {
const auto c_grid_desc_m_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, M, N),
make_tuple(M * StrideC, StrideC, I1));
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(BatchCount, M, N),
make_tuple(N * StrideC, I1, StrideC));
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, M));
}
}();
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
const auto c_grid_desc_g_mp_np =
transform_tensor_descriptor(c_grid_desc_g_m_n,
make_tuple(make_pass_through_transform(BatchCount),
make_right_pad_transform(M, PadM),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
const auto c_grid_desc_mp_np = transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return c_grid_desc_g_mp_np;
return c_grid_desc_mp_np;
}
using AGridDesc_G_K0_M_K1 = decltype(MakeAGridDescriptor_G_K0_M_K1(1, 1, 1, 1));
using BGridDesc_G_K0_N_K1 = decltype(MakeBGridDescriptor_G_K0_N_K1(1, 1, 1, 1));
using CGridDesc_G_M_N = decltype(MakeCGridDescriptor_G_M_N(1, 1, 1, 1));
// GridwiseBatchedGemm
using GridwiseBatchedGemm = GridwiseBatchedGemm_gk0mk1_gk0nk1_gmn_xdlops_v2r3<
BlockSize,
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CDataType,
InMemoryDataOperationEnum_t::Set,
AGridDesc_G_K0_M_K1,
BGridDesc_G_K0_N_K1,
CGridDesc_G_M_N,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
MPerBlock,
NPerBlock,
K0PerBlock,
MPerXDL,
NPerXDL,
K1,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_G_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_G_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN,
Sequence<0, 1, 3, 5, 6, 7, 2, 4, 8>, // CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector>;
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
struct Block2CTileMapMaker
{
Block2CTileMapMaker(index_t num_batches) : num_batches_(num_batches) {}
__host__ __device__ constexpr auto
MakeBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
{
const auto M = c_grid_desc_m_n.GetLength(I0);
const auto N = c_grid_desc_m_n.GetLength(I1);
constexpr auto M1 = Number<MPerBlock>{};
constexpr auto N1 = Number<NPerBlock>{};
const auto M0 = M / M1;
const auto N0 = N / N1;
const auto M00 = M0 / M01;
const auto N00 = N0 / N01;
const auto g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_insert_transform(num_batches_),
make_unmerge_transform(make_tuple(M00, M01)),
make_unmerge_transform(make_tuple(N00, N01))),
make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}));
const auto globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor =
make_single_stage_tensor_adaptor(
make_tuple(make_merge_transform(make_tuple(num_batches_, M00, N00, M01, N01))),
make_tuple(Sequence<0, 1, 2, 3, 4>{}),
make_tuple(Sequence<0>{}));
const auto globalblockid_to_m0_n0_block_cluster_adaptor =
chain_tensor_adaptors(g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor,
globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor);
return globalblockid_to_m0_n0_block_cluster_adaptor;
}
private:
index_t num_batches_;
};
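// Informal reading of the two chained adaptors above: the 1D block id is first
// decomposed as
//   block_id -> (batch, m00, n00, m01, n01)
// (merge of num_batches_, M00, N00, M01, N01, last index fastest), and those
// indices are then folded into the C tile coordinates
//   m0 = m00 * M01 + m01,   n0 = n00 * N01 + n01.
// The batch index is inserted here rather than mapped to a tile, so every batch
// walks the same M0 x N0 tile grid in M01 x N01 clusters; the kernel re-derives
// the batch from the block id via num_blocks_per_batch.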
struct ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC)
: BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
{
}
__host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideA_);
}
__host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideB_);
}
__host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const
{
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideC_;
};
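// How this is wired up below: the Argument constructor passes the element-space
// sizes of the single-GEMM descriptors as BatchStrideA/B/C, i.e. batch g starts
// at p_x_grid + g * GetElementSpaceSize(), so the batches are assumed to be laid
// out back-to-back in memory. Returning long_index_t lets the offsets exceed the
// 32-bit index_t range for large batched problems.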
// GridwiseGemm
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CDataType,
InMemoryDataOperationEnum_t::Set,
AGridDesc_K0_M_K1,
BGridDesc_K0_N_K1,
CGridDesc_M_N,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
MPerBlock,
NPerBlock,
K0PerBlock,
MPerXDL,
NPerXDL,
K1,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN,
Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector>;
using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{}));
using Block2CTileMap =
decltype(Block2CTileMapMaker{1}.MakeBlock2CTileMap(CGridDesc_M_N{}, 1, 1));
// Argument
struct Argument : public BaseArgument
......@@ -222,10 +344,16 @@ struct DeviceBatchedGemmXdl
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
a_grid_desc_g_k0_m_k1_{},
b_grid_desc_g_k0_n_k1_{},
c_grid_desc_g_m_n_{},
c_grid_desc_g_m0_n0_m1_n1_m2_m3_m4_n2_{},
BatchCount_(BatchCount),
a_grid_desc_k0_m_k1_{
DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
b_grid_desc_k0_n_k1_{
DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)},
c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{},
compute_base_ptr_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(),
b_grid_desc_k0_n_k1_.GetElementSpaceSize(),
c_grid_desc_m_n_.GetElementSpaceSize()},
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
......@@ -233,22 +361,14 @@ struct DeviceBatchedGemmXdl
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
a_grid_desc_g_k0_m_k1_ =
DeviceBatchedGemmXdl::MakeAGridDescriptor_G_K0_M_K1(BatchCount, M, K, StrideA);
b_grid_desc_g_k0_n_k1_ =
DeviceBatchedGemmXdl::MakeBGridDescriptor_G_K0_N_K1(BatchCount, K, N, StrideB);
c_grid_desc_g_m_n_ =
DeviceBatchedGemmXdl::MakeCGridDescriptor_G_M_N(BatchCount, M, N, StrideC);
if(GridwiseBatchedGemm::CheckValidity(
a_grid_desc_g_k0_m_k1_, b_grid_desc_g_k0_n_k1_, c_grid_desc_g_m_n_, M01_, N01_))
if(GridwiseGemm::CheckValidity(
a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_, M01_, N01_))
{
c_grid_desc_g_m0_n0_m1_n1_m2_m3_m4_n2_ =
GridwiseBatchedGemm::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
c_grid_desc_g_m_n_);
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
block_2_ctile_map_ =
GridwiseBatchedGemm::MakeDefaultBlock2CTileMap(c_grid_desc_g_m_n_, M01, N01);
Block2CTileMapMaker{BatchCount}.MakeBlock2CTileMap(c_grid_desc_m_n_, M01, N01);
}
}
......@@ -256,12 +376,13 @@ struct DeviceBatchedGemmXdl
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
AGridDesc_G_K0_M_K1 a_grid_desc_g_k0_m_k1_;
BGridDesc_G_K0_N_K1 b_grid_desc_g_k0_n_k1_;
CGridDesc_G_M_N c_grid_desc_g_m_n_;
typename GridwiseBatchedGemm::CGridDesc_G_M0_N0_M1_N1_M2_M3_M4_N2
c_grid_desc_g_m0_n0_m1_n1_m2_m3_m4_n2_;
typename GridwiseBatchedGemm::DefaultBlock2CTileMap block_2_ctile_map_;
index_t BatchCount_;
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_;
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
Block2CTileMap block_2_ctile_map_;
index_t M01_;
index_t N01_;
AElementwiseOperation a_element_op_;
......@@ -277,57 +398,51 @@ struct DeviceBatchedGemmXdl
float Run(const Argument& arg, int nrepeat = 1)
{
{
std::cout << "arg.a_grid_desc_g_k0_m_k1_{"
<< arg.a_grid_desc_g_k0_m_k1_.GetLength(I0) << ", "
<< arg.a_grid_desc_g_k0_m_k1_.GetLength(I1) << ", "
<< arg.a_grid_desc_g_k0_m_k1_.GetLength(I2) << ", "
<< arg.a_grid_desc_g_k0_m_k1_.GetLength(I3) << "}" << std::endl;
std::cout << "arg.b_grid_desc_g_k0_n_k1_{"
<< arg.b_grid_desc_g_k0_n_k1_.GetLength(I0) << ", "
<< arg.b_grid_desc_g_k0_n_k1_.GetLength(I1) << ", "
<< arg.b_grid_desc_g_k0_n_k1_.GetLength(I2) << ", "
<< arg.b_grid_desc_g_k0_n_k1_.GetLength(I3) << "}" << std::endl;
std::cout << "arg.c_grid_desc_g_m_n_{" << arg.c_grid_desc_g_m_n_.GetLength(I0)
<< ", " << arg.c_grid_desc_g_m_n_.GetLength(I1) << ", "
<< arg.c_grid_desc_g_m_n_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0)
<< ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
if(!GridwiseBatchedGemm::CheckValidity(arg.a_grid_desc_g_k0_m_k1_,
arg.b_grid_desc_g_k0_n_k1_,
arg.c_grid_desc_g_m_n_,
arg.M01_,
arg.N01_))
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
arg.M01_,
arg.N01_))
{
throw std::runtime_error(
"wrong! GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
}
const index_t grid_size =
GridwiseBatchedGemm::CalculateGridSize(arg.c_grid_desc_g_m_n_);
GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_) * arg.BatchCount_;
const auto K0 = arg.a_grid_desc_g_k0_m_k1_.GetLength(I1);
const auto K0 = arg.a_grid_desc_k0_m_k1_.GetLength(I0);
const bool has_main_k0_block_loop =
GridwiseBatchedGemm::CalculateHasMainK0BlockLoop(K0);
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
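// Note: HasMainKBlockLoop is a compile-time template parameter of the kernel, so
// the runtime flag computed here only selects which of the two pre-instantiated
// kernels (true / false) gets launched; the K0 main loop inside GridwiseGemm::Run
// can then be specialized out when there is no main-loop iteration to do.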
float ave_time = 0;
if(has_main_k0_block_loop)
{
const auto kernel = kernel_batched_gemm_xdlops_v2r3<
GridwiseBatchedGemm,
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_G_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_G_K0_N_K1>,
remove_reference_t<
typename GridwiseBatchedGemm::CGridDesc_G_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
remove_reference_t<typename GridwiseBatchedGemm::DefaultBlock2CTileMap>,
ComputeBasePtrOfStridedBatch,
remove_reference_t<Block2CTileMap>,
true>;
ave_time = launch_and_time_kernel(kernel,
......@@ -338,28 +453,30 @@ struct DeviceBatchedGemmXdl
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_g_k0_m_k1_,
arg.b_grid_desc_g_k0_n_k1_,
arg.c_grid_desc_g_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.BatchCount_,
arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.compute_base_ptr_of_batch_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel = kernel_batched_gemm_xdlops_v2r3<
GridwiseBatchedGemm,
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_G_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_G_K0_N_K1>,
remove_reference_t<
typename GridwiseBatchedGemm::CGridDesc_G_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<DeviceBatchedGemmXdl::AGridDesc_K0_M_K1>,
remove_reference_t<DeviceBatchedGemmXdl::BGridDesc_K0_N_K1>,
remove_reference_t<typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
remove_reference_t<typename GridwiseBatchedGemm::DefaultBlock2CTileMap>,
ComputeBasePtrOfStridedBatch,
remove_reference_t<Block2CTileMap>,
false>;
ave_time = launch_and_time_kernel(kernel,
......@@ -370,12 +487,14 @@ struct DeviceBatchedGemmXdl
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_g_k0_m_k1_,
arg.b_grid_desc_g_k0_n_k1_,
arg.c_grid_desc_g_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.BatchCount_,
arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.compute_base_ptr_of_batch_,
arg.block_2_ctile_map_);
}
......@@ -397,11 +516,11 @@ struct DeviceBatchedGemmXdl
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseBatchedGemm::CheckValidity(arg.a_grid_desc_g_k0_m_k1_,
arg.b_grid_desc_g_k0_n_k1_,
arg.c_grid_desc_g_m_n_,
arg.M01_,
arg.N01_);
return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
arg.M01_,
arg.N01_);
}
// polymorphic
......
......@@ -459,6 +459,16 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
{
for(index_t i_xtilda = 0; i_xtilda < XTilda; ++i_xtilda)
{
// check slice is valid
const index_t Y = filter_spatial_lengths_[0];
const index_t X = filter_spatial_lengths_[1];
const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilda, YTilda);
const auto XDotSlice = math::integer_divide_ceil(X - i_xtilda, XTilda);
if(YDotSlice * XDotSlice <= 0)
{
continue;
}
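// Worked example of the slice check above (hypothetical Y = X = 3, YTilda = XTilda = 2):
//   i_ytilda = 0 -> YDotSlice = ceil(3 / 2) = 2,  i_ytilda = 1 -> YDotSlice = ceil(2 / 2) = 1,
// so all slices are non-empty; the `continue` only fires when a sub-filter slice
// has zero taps, e.g. a 1x1 filter with YTilda = 2 and i_ytilda = 1 gives YDotSlice = 0.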
const auto descs = DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
N,
K,
......
......@@ -875,7 +875,8 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock
<< K0PerBlock << ", "
<< getConvFwdSpecializationStr(ConvForwardSpecialization)
<< ">";
// clang-format on
......
......@@ -466,7 +466,6 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
#endif
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
......@@ -708,7 +707,8 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock
<< K0PerBlock << ", "
<< getConvFwdSpecializationStr(ConvForwardSpecialization)
<< ">";
// clang-format on
......
......@@ -207,41 +207,28 @@ struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_
const index_t Ho = output_spatial_lengths[1];
const index_t Wo = output_spatial_lengths[2];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
static_assert(ConvForwardSpecialization == -1, "Not implemented!");
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
static_assert(ConvForwardSpecialization == -1, "Not implemented!");
}
else
{
const auto in_desc_n_di_hi_wi_c =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto wei_desc_k_z_y_x_c =
make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C));
const auto out_desc_n_do_ho_wo_k =
make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K));
const auto descs =
transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad(
in_desc_n_di_hi_wi_c,
wei_desc_k_z_y_x_c,
out_desc_n_do_ho_wo_k,
make_tuple(
conv_filter_strides[0], conv_filter_strides[1], conv_filter_strides[2]),
make_tuple(conv_filter_dilations[0],
conv_filter_dilations[1],
conv_filter_dilations[2]),
make_tuple(input_left_pads[0], input_left_pads[1], input_left_pads[2]),
make_tuple(input_right_pads[0], input_right_pads[1], input_right_pads[2]),
Number<K1>{});
return descs;
}
static_assert(ConvForwardSpecialization == ConvolutionForwardSpecialization_t::Default,
"Wrong! This specialization not implemented!");
const auto in_desc_n_di_hi_wi_c =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto wei_desc_k_z_y_x_c =
make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C));
const auto out_desc_n_do_ho_wo_k =
make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K));
const auto descs = transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad(
in_desc_n_di_hi_wi_c,
wei_desc_k_z_y_x_c,
out_desc_n_do_ho_wo_k,
make_tuple(conv_filter_strides[0], conv_filter_strides[1], conv_filter_strides[2]),
make_tuple(
conv_filter_dilations[0], conv_filter_dilations[1], conv_filter_dilations[2]),
make_tuple(input_left_pads[0], input_left_pads[1], input_left_pads[2]),
make_tuple(input_right_pads[0], input_right_pads[1], input_right_pads[2]),
Number<K1>{});
return descs;
}
using ABCGridDescs = remove_cvref_t<decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
......
......@@ -367,6 +367,155 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
}
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetInputTensorDescriptor(ck::index_t N,
ck::index_t C,
ck::index_t gemm_m,
ck::index_t gemm_k,
ck::index_t gemm_m_pad,
const std::vector<ck::index_t>& input_spatial_lengths,
const std::vector<ck::index_t>& filter_spatial_lengths,
const std::vector<ck::index_t>& output_spatial_lengths,
const std::vector<ck::index_t>& conv_filter_strides,
const std::vector<ck::index_t>& conv_filter_dilations,
const std::vector<ck::index_t>& input_left_pads,
const std::vector<ck::index_t>& input_right_pads)
{
const ck::index_t gemm_k0 = gemm_k / GemmK1Number;
const index_t Di = input_spatial_lengths[0];
const index_t Hi = input_spatial_lengths[1];
const index_t Wi = input_spatial_lengths[2];
const index_t Do = output_spatial_lengths[0];
const index_t Ho = output_spatial_lengths[1];
const index_t Wo = output_spatial_lengths[2];
const index_t ConvStrideD = conv_filter_strides[0];
const index_t ConvStrideH = conv_filter_strides[1];
const index_t ConvStrideW = conv_filter_strides[2];
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
const auto in_gemmmraw_gemmk_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(gemm_m, gemm_k));
// in_gemmk0_gemmm_gemmk1_grid_desc
return transform_tensor_descriptor(
in_gemmmraw_gemmk_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
make_right_pad_transform(gemm_m, gemm_m_pad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Pad0)
{
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_do_ho_wo_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor(
in_n_do_ho_wo_c_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
make_merge_transform(make_tuple(N, Do, Ho, Wo))),
make_tuple(Sequence<4>{}, Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
// in_gemmk0_gemmm_gemmk1_grid_desc
return transform_tensor_descriptor(
in_gemmk0_gemmmraw_gemmk1_grid_desc,
make_tuple(make_pass_through_transform(gemm_k0),
make_right_pad_transform(gemm_m, gemm_m_pad),
make_pass_through_transform(GemmK1Number)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
else
{
const index_t Z = filter_spatial_lengths[0];
const index_t Y = filter_spatial_lengths[1];
const index_t X = filter_spatial_lengths[2];
const index_t ConvDilationD = conv_filter_dilations[0];
const index_t ConvDilationH = conv_filter_dilations[1];
const index_t ConvDilationW = conv_filter_dilations[2];
const index_t InLeftPadD = input_left_pads[0];
const index_t InLeftPadH = input_left_pads[1];
const index_t InLeftPadW = input_left_pads[2];
const index_t InRightPadD = input_right_pads[0];
const index_t InRightPadH = input_right_pads[1];
const index_t InRightPadW = input_right_pads[2];
const auto in_n_di_hi_wi_c_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_di_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Di, InLeftPadD, InRightPadD),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
make_tuple(Sequence<0>{},
Sequence<1, 2>{},
Sequence<3, 4>{},
Sequence<5, 6>{},
Sequence<7>{}));
const auto in_gemmk_gemmmraw_grid_desc = transform_tensor_descriptor(
in_n_z_do_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(Z, Y, X, C)),
make_merge_transform(make_tuple(N, Do, Ho, Wo))),
make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor(
in_gemmk_gemmmraw_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(gemm_k0, GemmK1Number)),
make_pass_through_transform(gemm_m)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
// in_gemmk0_gemmm_gemmk1_grid_desc
return transform_tensor_descriptor(
in_gemmk0_gemmmraw_gemmk1_grid_desc,
make_tuple(make_pass_through_transform(gemm_k0),
make_right_pad_transform(gemm_m, gemm_m_pad),
make_pass_through_transform(GemmK1Number)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
}
}
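// Shape of the general path above (a 3D im2col expressed purely as tensor-descriptor
// transforms), with the intermediate views in order:
//   (N, Di, Hi, Wi, C)                          naive input
//   (N, Dip, Hip, Wip, C)                       pad the three spatial dims
//   (N, Z, Do, Y, Ho, X, Wo, C)                 embed filter taps x output positions
//   (gemm_k, gemm_m_raw)                        merge: gemm_k = Z*Y*X*C, gemm_m_raw = N*Do*Ho*Wo
//   (gemm_k0, gemm_m_raw, GemmK1)               unmerge K into (K0, K1)
//   (gemm_k0, gemm_m_raw + gemm_m_pad, GemmK1)  right-pad M by the supplied gemm_m_pad
// The two 1x1x1 specializations above short-circuit this chain.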
static index_t GetGemmMRaw(ck::index_t N,
const std::vector<ck::index_t>& output_spatial_lengths)
{
......@@ -445,6 +594,13 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1});
}
template <ck::index_t NDim, typename std::enable_if<NDim == 3, bool>::type = false>
static auto GetABCGridDesc()
{
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
1, 1, 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
}
using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());
using AGridDesc_K0_M_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
......@@ -593,6 +749,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
float Run(const Argument& arg, int nrepeat = 1)
{
#if 0
{
std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0)
<< ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
......@@ -605,7 +762,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
#endif
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_,
arg.b_grid_desc_k0_n_k1_,
arg.c_grid_desc_m_n_,
......@@ -704,6 +861,22 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
static bool IsSupportedArgument(const Argument& arg)
{
// Input tensors can't be bigger than 2GB each.
constexpr std::size_t GB2 = 2 * 1e9;
if(arg.a_grid_desc_k0_m_k1_.GetElementSpaceSize() > GB2)
{
return false;
}
if(arg.b_grid_desc_k0_n_k1_.GetElementSpaceSize() > GB2)
{
return false;
}
if(arg.c_grid_desc_m_n_.GetElementSpaceSize() > GB2)
{
return false;
}
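// Note: GetElementSpaceSize() counts elements, not bytes, so this guard caps each
// tensor at 2e9 elements; that matches the "2GB" comment exactly only for 1-byte
// element types and corresponds to roughly 4GB for fp16.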
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
{
......@@ -851,7 +1024,8 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock
<< K0PerBlock << ", "
<< getConvFwdSpecializationStr(ConvForwardSpecialization)
<< ">";
// clang-format on
......
#ifndef DEVICE_GEMM_HPP
#define DEVICE_GEMM_HPP
#pragma once
#include <iostream>
#include "device_base.hpp"
......@@ -8,35 +6,12 @@ namespace ck {
namespace tensor_operation {
namespace device {
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGemmBias : public BaseOperator
struct GemmShape
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_bias,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
ck::index_t M, N, K;
ck::index_t StrideA, StrideB, StrideC;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGemmBiasPtr = std::unique_ptr<
DeviceGemmBias<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
......@@ -65,7 +40,29 @@ template <typename AElementwiseOperation,
using DeviceGemmPtr = std::unique_ptr<
DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGroupedGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(std::vector<const void*>& p_a,
std::vector<const void*>& p_b,
std::vector<void*>& p_c,
std::vector<GemmShape>& gemm_shapes,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
ck::index_t KBatch = 1) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGroupedGemmPtr = std::unique_ptr<
DeviceGroupedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
#pragma once
#include <iostream>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGemmBias : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_bias,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGemmBiasPtr = std::unique_ptr<
DeviceGemmBias<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
#pragma once
#include <iostream>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename D0ReduceOperation,
typename D1ReduceOperation>
struct DeviceGemmReduce : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
void* p_d0,
void* p_d1,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
D0ReduceOperation d0_reduce_op,
D1ReduceOperation d1_reduce_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename D0ReduceOperation,
typename D1ReduceOperation>
using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
D0ReduceOperation,
D1ReduceOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
#pragma once
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_gemm_reduce.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_reduce_xdl_cshuffle_v1.hpp"
#include "gemm_specialization.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename ReduceAccDataType,
typename DDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
typename D0ReduceOperation,
typename D1ReduceOperation,
GemmSpecialization_t GemmSpecialization,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>
struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
D0ReduceOperation,
D1ReduceOperation>
{
using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto MPad = M - MRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::MKPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad both M and K
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
GemmSpecialization == GemmSpecialization_t::MNPadding)
{
// pad M, but not K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
GemmSpecialization == GemmSpecialization_t::NKPadding)
{
// pad K, but not M
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k = transform_tensor_descriptor(
a_grid_desc_mraw_kraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else
{
// not pad M or K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
}
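// Hypothetical numbers for the MKPadding branch above: with MPerBlock = 128,
// KPerBlock = 32, AK1 = 8, MRaw = 1000, KRaw = 100:
//   M = ceil(1000 / 128) * 128 = 1024, MPad = 24
//   K = ceil(100 / 32) * 32    = 128,  KPad = 28, AK0 = 128 / 8 = 16
// so the A descriptor becomes (AK0, M, AK1) = (16, 1024, 8); the other branches
// skip the M and/or K padding step accordingly.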
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto NPad = N - NRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::NKPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad both N and K
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_right_pad_transform(NRaw, NPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
GemmSpecialization == GemmSpecialization_t::MNPadding)
{
// pad N, but not K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
GemmSpecialization == GemmSpecialization_t::MKPadding)
{
// pad K, but not N
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k = transform_tensor_descriptor(
b_grid_desc_nraw_kraw,
make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else
{
// not pad N or K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
}
static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
{
const auto c_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideC));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto MPad = M - MRaw;
const auto NPad = N - NRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad M and N
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
GemmSpecialization == GemmSpecialization_t::MKPadding)
{
// pad M, but not N
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
GemmSpecialization == GemmSpecialization_t::NKPadding)
{
// pad N, but not M
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
// not pad M or N
return c_grid_desc_mraw_nraw;
}
}
// assume D is a packed tensor
static auto MakeDGridDescriptor_M(index_t MRaw)
{
const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw));
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto MPad = M - MRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
GemmSpecialization == GemmSpecialization_t::MNPadding ||
GemmSpecialization == GemmSpecialization_t::MKPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad M
return transform_tensor_descriptor(d_grid_desc_mraw,
make_tuple(make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
}
else
{
// not pad M
return d_grid_desc_mraw;
}
}
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
using DGridDesc_M = decltype(MakeDGridDescriptor_M(1));
// GridwiseGemm
using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
ReduceAccDataType,
DDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
D0ReduceOperation,
D1ReduceOperation,
InMemoryDataOperationEnum_t::Set,
InMemoryDataOperationEnum_t::AtomicAdd,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
CGridDesc_M_N,
DGridDesc_M,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock,
CReduceThreadClusterLengths_MPerBlock_NPerBlock,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock>;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
CDataType* p_c_grid,
DDataType* p_d0_grid,
DDataType* p_d1_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
D0ReduceOperation d0_reduce_op,
D1ReduceOperation d1_reduce_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
p_d0_grid_{p_d0_grid},
p_d1_grid_{p_d1_grid},
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
d_grid_desc_m_{DeviceOp::MakeDGridDescriptor_M(MRaw)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
d_grid_desc_mblock_mperblock_{},
block_2_ctile_map_{},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op},
d0_reduce_op_{d0_reduce_op},
d1_reduce_op_{d1_reduce_op}
{
if(GridwiseGemm::CheckValidity(
a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, c_grid_desc_m_n_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
d_grid_desc_mblock_mperblock_ =
GridwiseGemm::MakeDGridDescriptor_MBlock_MPerBlock(d_grid_desc_m_);
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
DDataType* p_d0_grid_;
DDataType* p_d1_grid_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
DGridDesc_M d_grid_desc_m_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock d_grid_desc_mblock_mperblock_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
D0ReduceOperation d0_reduce_op_;
D1ReduceOperation d1_reduce_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int /* nrepeat */ = 1)
{
#if 0
{
std::cout << "arg.a_grid_desc_ak0_m_ak1_{"
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_bk0_n_bk1_{"
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
std::cout << "arg.d_grid_desc_m_{ " << arg.d_grid_desc_m_.GetLength(I0) << "}"
<< std::endl;
}
#endif
if(!GridwiseGemm::CheckValidity(
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_);
const auto K0 = arg.a_grid_desc_ak0_m_ak1_.GetLength(I0);
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
if(has_main_k0_block_loop)
{
const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
DDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
D0ReduceOperation,
D1ReduceOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
true>;
launch_kernel(kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.p_d0_grid_,
arg.p_d1_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.d0_reduce_op_,
arg.d1_reduce_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.d_grid_desc_mblock_mperblock_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
DDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
D0ReduceOperation,
D1ReduceOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
false>;
launch_kernel(kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.p_d0_grid_,
arg.p_d1_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.d0_reduce_op_,
arg.d1_reduce_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.d_grid_desc_mblock_mperblock_,
arg.block_2_ctile_map_);
}
return 0;
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseGemm::CheckValidity(
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
DDataType* p_d0,
DDataType* p_d1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
D0ReduceOperation d0_reduce_op,
D1ReduceOperation d1_reduce_op)
{
return Argument{p_a,
p_b,
p_c,
p_d0,
p_d1,
MRaw,
NRaw,
KRaw,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
d0_reduce_op,
d1_reduce_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
void* p_d0,
void* p_d1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
D0ReduceOperation d0_reduce_op,
D1ReduceOperation d1_reduce_op) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
static_cast<DDataType*>(p_d0),
static_cast<DDataType*>(p_d1),
MRaw,
NRaw,
KRaw,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
d0_reduce_op,
d1_reduce_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceGemmReduce_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1
<< ">";
// clang-format on
return str.str();
}
};
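// A minimal usage sketch for this GEMM + reduce op, written from the caller's side inside a
// function body; `DeviceGemmReduceInstance` is an assumed, fully specialized alias of the
// struct above, and the pointers, sizes and element-wise/reduce ops are placeholders rather
// than names defined in this header:
#if 0
auto op = DeviceGemmReduceInstance{};
auto arg = op.MakeArgument(p_a, p_b, p_c, p_d0, p_d1, M, N, K, StrideA, StrideB, StrideC,
a_op, b_op, c_op, d0_reduce_op, d1_reduce_op);
auto invoker = op.MakeInvoker();
if(op.IsSupportedArgument(arg))
{
invoker.Run(arg);
}
#endif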
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -4,9 +4,7 @@
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "device_gemm_xdl.hpp"
#include "device_gemm_bias.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
......
#pragma once
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdl_cshuffle_v1.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename CDataType,
typename GemmAccDataType,
typename CShuffleDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization_t GemmSpecialization,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t AK1,
index_t BK1,
index_t MPerXDL,
index_t NPerXDL,
index_t MXdlPerWave,
index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_AK1,
bool ABlockLdsExtraM,
typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
index_t BBlockTransferDstScalarPerVector_BK1,
bool BBlockLdsExtraN,
index_t CShuffleMXdlPerWavePerShuffle,
index_t CShuffleNXdlPerWavePerShuffle,
typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
struct DeviceGemm_Xdl_CShuffle
: public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
{
using DeviceOp = DeviceGemm_Xdl_CShuffle;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
{
const auto a_grid_desc_mraw_kraw = [&]() {
if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(StrideA, I1));
}
else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
make_tuple(I1, StrideA));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto MPad = M - MRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::MKPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad both M and K
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
GemmSpecialization == GemmSpecialization_t::MNPadding)
{
// pad M, but not K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_right_pad_transform(MRaw, MPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
GemmSpecialization == GemmSpecialization_t::NKPadding)
{
// pad K, but not M
assert(K % AK1 == 0);
const auto AK0 = K / AK1;
const auto a_grid_desc_m_k = transform_tensor_descriptor(
a_grid_desc_mraw_kraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
else
{
// pad neither M nor K
assert(KRaw % AK1 == 0);
const auto AK0 = KRaw / AK1;
const auto a_grid_desc_ak0_m_ak1 =
transform_tensor_descriptor(a_grid_desc_mraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
make_pass_through_transform(MRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return a_grid_desc_ak0_m_ak1;
}
}
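// Illustrative sketch of the padding arithmetic above (values are assumptions, not taken
// from this header): with MRaw = 1000, KRaw = 500, MPerBlock = 256, KPerBlock = 64 and
// AK1 = 8 under MNKPadding,
//   M = ceil(1000 / 256) * 256 = 1024 (MPad = 24), K = ceil(500 / 64) * 64 = 512 (KPad = 12),
//   AK0 = 512 / 8 = 64,
// so the resulting A descriptor has lengths [AK0, M, AK1] = [64, 1024, 8].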
static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
{
const auto b_grid_desc_nraw_kraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(I1, StrideB));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
make_tuple(StrideB, I1));
}
}();
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
const auto NPad = N - NRaw;
const auto KPad = K - KRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::NKPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad both N and K
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_right_pad_transform(NRaw, NPad),
make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(N)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
GemmSpecialization == GemmSpecialization_t::MNPadding)
{
// pad N, but not K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::KPadding ||
GemmSpecialization == GemmSpecialization_t::MKPadding)
{
// pad K, but not N
assert(K % BK1 == 0);
const auto BK0 = K / BK1;
const auto b_grid_desc_n_k = transform_tensor_descriptor(
b_grid_desc_nraw_kraw,
make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_n_k,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
else
{
// pad neither N nor K
assert(KRaw % BK1 == 0);
const auto BK0 = KRaw / BK1;
const auto b_grid_desc_bk0_n_bk1 =
transform_tensor_descriptor(b_grid_desc_nraw_kraw,
make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
make_pass_through_transform(NRaw)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
return b_grid_desc_bk0_n_bk1;
}
}
static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
{
const auto c_grid_desc_mraw_nraw = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
make_tuple(I1, StrideC));
}
}();
const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
const auto MPad = M - MRaw;
const auto NPad = N - NRaw;
if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding ||
GemmSpecialization == GemmSpecialization_t::MNKPadding)
{
// pad M and N
return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad),
make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::MPadding ||
GemmSpecialization == GemmSpecialization_t::MKPadding)
{
// pad M, but not N
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else if constexpr(GemmSpecialization == GemmSpecialization_t::NPadding ||
GemmSpecialization == GemmSpecialization_t::NKPadding)
{
// pad N, but not M
return transform_tensor_descriptor(
c_grid_desc_mraw_nraw,
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
// pad neither M nor N
return c_grid_desc_mraw_nraw;
}
}
using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
ADataType, // TODO: distinguish A/B datatype
GemmAccDataType,
CShuffleDataType,
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
InMemoryDataOperationEnum_t::Set,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
CGridDesc_M_N,
NumGemmKPrefetchStage,
BlockSize,
MPerBlock,
NPerBlock,
KPerBlock,
AK1,
BK1,
MPerXDL,
NPerXDL,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_AK0_M_AK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_AK1,
false,
ABlockLdsExtraM,
BBlockTransferThreadClusterLengths_BK0_N_BK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_BK1,
false,
BBlockLdsExtraN,
CShuffleMXdlPerWavePerShuffle,
CShuffleNXdlPerWavePerShuffle,
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
CShuffleBlockTransferScalarPerVector_NPerBlock>;
// Argument
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
CDataType* p_c_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
c_grid_desc_mblock_mperblock_nblock_nperblock_{},
block_2_ctile_map_{},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
if(GridwiseGemm::CheckValidity(
a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, c_grid_desc_m_n_))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_ =
GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
c_grid_desc_m_n_);
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
}
}
// private:
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
CGridDesc_M_N c_grid_desc_m_n_;
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_;
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceOp::Argument;
float Run(const Argument& arg, int /* nrepeat */ = 1)
{
#if 0
{
std::cout << "arg.a_grid_desc_ak0_m_ak1_{"
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", "
<< arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_bk0_n_bk1_{"
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", "
<< arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
#endif
if(!GridwiseGemm::CheckValidity(
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_))
{
throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
}
const index_t grid_size = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_);
const auto K0 = arg.a_grid_desc_ak0_m_ak1_.GetLength(I0);
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
if(has_main_k0_block_loop)
{
const auto kernel = kernel_gemm_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
true>;
launch_kernel(kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel = kernel_gemm_xdl_cshuffle_v1<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
typename GridwiseGemm::DefaultBlock2CTileMap,
false>;
launch_kernel(kernel,
dim3(grid_size),
dim3(BlockSize),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
arg.block_2_ctile_map_);
}
return 0;
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
return GridwiseGemm::CheckValidity(
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_);
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a,
p_b,
p_c,
MRaw,
NRaw,
KRaw,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t StrideA,
index_t StrideB,
index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
index_t /* KBatch */ = 1) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
MRaw,
NRaw,
KRaw,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceGemm_Xdl_CShuffle"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1
<< ">";
// clang-format on
return str.str();
}
};
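// A minimal usage sketch, written from the caller's side inside a function body;
// `DeviceGemmInstance` is an assumed, fully specialized alias of DeviceGemm_Xdl_CShuffle,
// and the pointers, sizes and element-wise ops are placeholders, not names defined here:
#if 0
auto gemm = DeviceGemmInstance{};
auto arg = gemm.MakeArgument(p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC, a_op, b_op, c_op);
auto invoker = gemm.MakeInvoker();
if(!gemm.IsSupportedArgument(arg))
{
throw std::runtime_error("wrong! this device op does not support the given problem");
}
invoker.Run(arg);
#endif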
} // namespace device
} // namespace tensor_operation
} // namespace ck
#ifndef DEVICE_GROUPED_GEMM_XDL_HPP
#define DEVICE_GROUPED_GEMM_XDL_HPP
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "device_gemm.hpp"
#include "common_header.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "gemm_specialization.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
GemmSpecialization_t GemmSpecialization,
ck::index_t BlockSize,
ck::index_t MPerBlock,
ck::index_t NPerBlock,
ck::index_t K0PerBlock,
ck::index_t K1,
ck::index_t MPerXDL,
ck::index_t NPerXDL,
ck::index_t MXdlPerWave,
ck::index_t NXdlPerWave,
typename ABlockTransferThreadClusterLengths_K0_M_K1,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
ck::index_t ABlockTransferSrcVectorDim,
ck::index_t ABlockTransferSrcScalarPerVector,
ck::index_t ABlockTransferDstScalarPerVector_K1,
bool ABlockLdsAddExtraM,
typename BBlockTransferThreadClusterLengths_K0_N_K1,
typename BBlockTransferThreadClusterArrangeOrder,
typename BBlockTransferSrcAccessOrder,
ck::index_t BBlockTransferSrcVectorDim,
ck::index_t BBlockTransferSrcScalarPerVector,
ck::index_t BBlockTransferDstScalarPerVector_K1,
bool BBlockLdsAddExtraN,
ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector,
ck::index_t NumPrefetch = 1,
ck::index_t MaxGroupCount = 10>
struct DeviceGroupedGemmXdl
: public DeviceGroupedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto K1Number = Number<K1>{};
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto a_grid_desc_m_k = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ALayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
}
}();
if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
{
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(M, PadM)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
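// Illustrative sketch (assumed values, not taken from this header): with K = 256 and K1 = 8,
// the K dimension is unmerged into K0 = 256 / 8 = 32 slices of length 8, so a row-major A of
// shape [M, K] = [128, 256] becomes a [K0, M, K1] = [32, 128, 8] descriptor; under MNPadding,
// M = 100 with MPerBlock = 64 would first be right-padded to 128 (PadM = 28).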
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
assert(K % K1 == 0);
const index_t K0 = K / K1;
const auto b_grid_desc_k_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
}
}();
if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
{
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
const auto c_grid_desc_m_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
}
}();
if constexpr(GemmSpecialization == GemmSpecialization_t::MNPadding)
{
const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock;
const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock;
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<
BlockSize,
ADataType, // TODO: distinguish A/B datatype
AccDataType,
CDataType,
InMemoryDataOperationEnum_t::Set,
AGridDesc_K0_M_K1,
BGridDesc_K0_N_K1,
CGridDesc_M_N,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
MPerBlock,
NPerBlock,
K0PerBlock,
MPerXDL,
NPerXDL,
K1,
MXdlPerWave,
NXdlPerWave,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
false, // AThreadTransferSrcResetCoordinateAfterRun,
ABlockLdsAddExtraM,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
false, // BThreadTransferSrcResetCoordinateAfterRun,
BBlockLdsAddExtraN,
Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
NumPrefetch>;
struct GroupedGemmBlock2CTileMap
{
GroupedGemmBlock2CTileMap()
{
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1);
BlockStart_ = -1;
}
GroupedGemmBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01,
index_t N01,
ck::index_t BlockStart)
{
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, M01, N01);
BlockStart_ = BlockStart;
}
template <typename TopIdx>
__host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
{
return block_2_ctile_map_.CalculateBottomIndex(
make_multi_index(idx_top[I0] - BlockStart_));
}
private:
typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
ck::index_t BlockStart_;
};
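// Illustrative sketch: each group owns a contiguous slice of the launch grid, so this map
// subtracts its BlockStart_ before delegating to the per-group tile map. With assumed values
// BlockStart_ = 400 and a global block id of 405, CalculateBottomIndex resolves local block 5
// of that group's C tile grid.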
struct GemmDescKernelArg
{
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_;
typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_;
GroupedGemmBlock2CTileMap grouped_gemm_block_2_ctile_map_;
const ADataType* a_ptr;
const BDataType* b_ptr;
CDataType* c_ptr;
ck::index_t BlockStart_, BlockEnd_;
};
// Argument
struct Argument : public BaseArgument
{
Argument(std::vector<const void*>& p_a,
std::vector<const void*>& p_b,
std::vector<void*>& p_c,
std::vector<GemmShape>& gemm_shapes,
index_t M01,
index_t N01,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
: M01_{M01},
N01_{N01},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
grid_size_ = 0;
group_count_ = static_cast<int>(gemm_shapes.size());
if(!(group_count_ == p_a.size() && group_count_ == p_b.size() &&
group_count_ == p_c.size()))
{
throw std::runtime_error("wrong! group_count_ != P_a/b/c.size");
}
gemm_desc_kernel_arg_.reserve(group_count_);
for(index_t i = 0; i < gemm_shapes.size(); i++)
{
const index_t M = gemm_shapes[i].M;
const index_t N = gemm_shapes[i].N;
const index_t K = gemm_shapes[i].K;
const index_t StrideA = gemm_shapes[i].StrideA;
const index_t StrideB = gemm_shapes[i].StrideB;
const index_t StrideC = gemm_shapes[i].StrideC;
const auto a_grid_desc_k0_m_k1_ =
DeviceGroupedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
const auto b_grid_desc_k0_n_k1_ =
DeviceGroupedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
const auto c_grid_desc_m_n_ =
DeviceGroupedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC);
const index_t grid_size_grp = GridwiseGemm::CalculateGridSize(c_grid_desc_m_n_);
const index_t BlockStart = grid_size_;
const index_t BlockEnd = grid_size_ + grid_size_grp;
grid_size_ += grid_size_grp;
if(GridwiseGemm::CheckValidity(
a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_, M01_, N01_))
{
const auto c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ =
GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_);
const auto grouped_gemm_block_2_ctile_map_ =
GroupedGemmBlock2CTileMap(c_grid_desc_m_n_, M01, N01, BlockStart);
gemm_desc_kernel_arg_.push_back(
GemmDescKernelArg{a_grid_desc_k0_m_k1_,
b_grid_desc_k0_n_k1_,
c_grid_desc_m_n_,
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
grouped_gemm_block_2_ctile_map_,
static_cast<const ADataType*>(p_a[i]),
static_cast<const BDataType*>(p_b[i]),
static_cast<CDataType*>(p_c[i]),
BlockStart,
BlockEnd});
}
}
}
// private:
index_t M01_;
index_t N01_;
index_t group_count_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;
index_t grid_size_;
};
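// Illustrative sketch (assumed values): for three groups whose C tile grids need 12, 8 and 20
// blocks, the constructor records [BlockStart_, BlockEnd_) ranges of [0, 12), [12, 20) and
// [20, 40), and the invoker launches grid_size_ = 40 blocks in total.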
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceGroupedGemmXdl::Argument;
float Run(const Argument& arg, int nrepeat = 1)
{
StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_arg_arg;
bool has_main_k0_block_loop = true;
static_for<0, MaxGroupCount, 1>{}([&](auto i) {
if(i < arg.gemm_desc_kernel_arg_.size())
{
gemm_desc_kernel_arg_arg(i) = arg.gemm_desc_kernel_arg_[i];
std::cout << "group: " << i << " arg.a_grid_desc_k0_m_k1_{"
<< gemm_desc_kernel_arg_arg[i].a_grid_desc_k0_m_k1_.GetLength(I0)
<< ", "
<< gemm_desc_kernel_arg_arg[i].a_grid_desc_k0_m_k1_.GetLength(I1)
<< ", "
<< gemm_desc_kernel_arg_arg[i].a_grid_desc_k0_m_k1_.GetLength(I2)
<< "}";
std::cout << ", arg.b_grid_desc_k0_n_k1_{"
<< gemm_desc_kernel_arg_arg[i].b_grid_desc_k0_n_k1_.GetLength(I0)
<< ", "
<< gemm_desc_kernel_arg_arg[i].b_grid_desc_k0_n_k1_.GetLength(I1)
<< ", "
<< gemm_desc_kernel_arg_arg[i].b_grid_desc_k0_n_k1_.GetLength(I2)
<< "}";
std::cout << ", arg.c_grid_desc_m_n_{ "
<< gemm_desc_kernel_arg_arg[i].c_grid_desc_m_n_.GetLength(I0) << ", "
<< gemm_desc_kernel_arg_arg[i].c_grid_desc_m_n_.GetLength(I1) << "}"
<< std::endl;
if(!GridwiseGemm::CheckValidity(
gemm_desc_kernel_arg_arg[i].a_grid_desc_k0_m_k1_,
gemm_desc_kernel_arg_arg[i].b_grid_desc_k0_n_k1_,
gemm_desc_kernel_arg_arg[i].c_grid_desc_m_n_,
arg.M01_,
arg.N01_))
{
throw std::runtime_error(
"wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
}
const auto K0 = gemm_desc_kernel_arg_arg[i].a_grid_desc_k0_m_k1_.GetLength(I0);
if(GridwiseGemm::CalculateHasMainK0BlockLoop(K0) != has_main_k0_block_loop)
{
throw std::runtime_error("wrong! not all gemm has_main_k0_block_loop");
}
}
});
float ave_time = 0;
if(has_main_k0_block_loop)
{
const auto kernel =
kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<GemmDescKernelArg>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
true,
MaxGroupCount>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(arg.grid_size_),
dim3(BlockSize),
0,
gemm_desc_kernel_arg_arg,
arg.gemm_desc_kernel_arg_.size(),
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
}
else
{
const auto kernel =
kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
CDataType,
remove_reference_t<GemmDescKernelArg>,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
false,
MaxGroupCount>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(arg.grid_size_),
dim3(BlockSize),
0,
gemm_desc_kernel_arg_arg,
arg.gemm_desc_kernel_arg_.size(),
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_);
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg, int nrepeat = 1) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), nrepeat);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(arg.gemm_desc_kernel_arg_.size() != arg.group_count_)
return false;
else
return true;
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(std::vector<const void*>& p_a,
std::vector<const void*>& p_b,
std::vector<void*>& p_c,
std::vector<GemmShape> gemm_shapes,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
return Argument{p_a, p_b, p_c, gemm_shapes, 1, 1, a_element_op, b_element_op, c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(std::vector<const void*>& p_a,
std::vector<const void*>& p_b,
std::vector<void*>& p_c,
std::vector<GemmShape>& gemm_shapes,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
index_t /* KBatch */ = 1) override
{
return std::make_unique<Argument>(
p_a, p_b, p_c, gemm_shapes, 1, 1, a_element_op, b_element_op, c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceGroupedGemmXdl"
<< "<"
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< K0PerBlock << ", "
<< K1 << ", "
<< MPerXDL << ", "
<< NPerXDL << ", "
<< MXdlPerWave << ", "
<< NXdlPerWave
<< ">";
// clang-format on
return str.str();
}
}
};
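// A minimal usage sketch for the grouped op, written from the caller's side inside a function
// body; `DeviceGroupedGemmInstance` is an assumed, fully specialized alias of the struct
// above, the pointer vectors and element-wise ops are placeholders, and the GemmShape field
// order {M, N, K, StrideA, StrideB, StrideC} is an assumption, not taken from this header:
#if 0
std::vector<GemmShape> gemm_shapes = {{M0, N0, K0, StrideA0, StrideB0, StrideC0},
{M1, N1, K1, StrideA1, StrideB1, StrideC1}};
auto grouped_op = DeviceGroupedGemmInstance{};
auto arg = grouped_op.MakeArgument(p_a_vec, p_b_vec, p_c_vec, gemm_shapes, a_op, b_op, c_op);
auto invoker = grouped_op.MakeInvoker();
if(grouped_op.IsSupportedArgument(arg))
{
invoker.Run(arg);
}
#endif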
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
......@@ -16,9 +16,11 @@ namespace device {
template <typename InElementwiseOperation, typename AccElementwiseOperation>
struct DeviceReduce : public BaseOperator
{
virtual size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths)
virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
const std::vector<int> reduceDims)
{
(void)inLengths;
(void)reduceDims;
return (0);
};
......@@ -32,19 +34,19 @@ struct DeviceReduce : public BaseOperator
};
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) = 0;
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
......
......@@ -36,20 +36,20 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
(MThreadSliceSize % OutDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
using IndexDataType = int32_t;
static constexpr bool BetaIsZero = NeedIndices;
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr index_t numSrcDim = Rank;
static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
......@@ -57,18 +57,18 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
const std::vector<int>& inStrides)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
if constexpr(reduceAllDim)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
......@@ -79,6 +79,9 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
}
else
{
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
......@@ -93,18 +96,20 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K =
math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
make_right_pad_transform(reduceLength, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
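// Illustrative sketch (assumed values): with invariantLength = 100, reduceLength = 230,
// M_BlockTileSize = 64 and K_BlockTileSize = 128, the padded 2-D view becomes [128, 256]
// (inPad_M = 28, inPad_K = 26), so every block operates on full tiles.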
......@@ -112,44 +117,45 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
const std::vector<int>& outStrides)
{
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto out_grid_desc_m = transform_tensor_descriptor(
outDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
const auto inPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto out_grid_desc_m_padded =
transform_tensor_descriptor(out_grid_desc_m,
make_tuple(make_right_pad_transform(outerLen, inPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
auto out_grid_desc_m_padded = transform_tensor_descriptor(
out_grid_desc_m,
make_tuple(make_right_pad_transform(invariantLength, inPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (out_grid_desc_m_padded);
};
struct Argument : public BaseArgument
{
Argument(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
Argument(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op)
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op)
: outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
......@@ -160,21 +166,21 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
{
(void)workspace_dev;
std::tie(inLengths_, inStrides_) =
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths_);
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
if constexpr(InvariantDims::Size() == 0)
if constexpr(NumInvariantDim == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
invariant_lowest_length = inLengths_[NumInvariantDim - 1];
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
reduce_lowest_length = inLengths_[Rank - 1];
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize;
......@@ -186,7 +192,7 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
AccDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
......@@ -278,18 +284,22 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
if constexpr(NumInvariantDim == 0)
{
return (false);
}
else
{
if(pArg->inStrides_[NumInvariantDim - 1] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
};
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
if(pArg->inStrides_[Rank - 1] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
......@@ -308,19 +318,19 @@ struct DeviceReduceBlockWise : public DeviceReduce<InElementwiseOperation, AccEl
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
......
......@@ -37,6 +37,10 @@ struct DeviceReduceBlockWiseSecondCall
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert((InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0) &&
(MThreadSliceSize % OutDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
using IndexDataType = int32_t;
static constexpr bool BetaIsZero = NeedIndices;
......@@ -46,12 +50,8 @@ struct DeviceReduceBlockWiseSecondCall
"InDataType and AccDataType should be the same to use DEviceReduceBlockWiseSecondCall!");
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
......@@ -65,18 +65,20 @@ struct DeviceReduceBlockWiseSecondCall
const auto in_grid_desc_m_k =
make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K =
math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
make_right_pad_transform(reduceLength, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
......@@ -84,26 +86,27 @@ struct DeviceReduceBlockWiseSecondCall
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
const std::vector<int>& outStrides)
{
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto out_grid_desc_m = transform_tensor_descriptor(
outDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto outPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto out_grid_desc_m_padded =
transform_tensor_descriptor(out_grid_desc_m,
make_tuple(make_right_pad_transform(outerLen, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
auto out_grid_desc_m_padded = transform_tensor_descriptor(
out_grid_desc_m,
make_tuple(make_right_pad_transform(invariantLength, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (out_grid_desc_m_padded);
};
......@@ -131,8 +134,8 @@ struct DeviceReduceBlockWiseSecondCall
in_elementwise_op_(in_elementwise_op),
acc_elementwise_op_(acc_elementwise_op)
{
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
invariant_total_length = inLengths[0];
reduce_total_length = inLengths[1];
......@@ -159,7 +162,7 @@ struct DeviceReduceBlockWiseSecondCall
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
AccDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
......@@ -268,19 +271,19 @@ struct DeviceReduceBlockWiseSecondCall
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) override
{
(void)reduceDims;
......
......@@ -12,38 +12,30 @@ namespace ck {
namespace tensor_operation {
namespace device {
// template <typename preUnaryOpType, typename posUnaryOpType>
// using DeviceReducePtr = std::unique_ptr<DeviceReduce<preUnaryOpType, posUnaryOpType>>;
template <int Rank, typename ReduceDims>
// here, inLengths[] is already shuffled so that the lengths of the invariant dims come
// before those of the reduce dims
template <int Rank, int NumReduceDim>
std::pair<size_t, size_t> get_2d_lengths(const std::vector<int>& inLengths)
{
static_assert(Rank <= 6, "Rank larger than 6 is not supported!");
size_t tensor_total_length = 1;
size_t reduce_total_length = 1;
static_for<0, ReduceDims::Size(), 1>{}(
[&](auto i) { reduce_total_length *= inLengths[ReduceDims::At(i)]; });
size_t invariant_total_length = 1;
size_t reduce_total_length = 1;
static_for<0, Rank, 1>{}([&](auto i) { tensor_total_length *= inLengths[i.value]; });
constexpr int NumInvariantDim = Rank - NumReduceDim;
return std::make_pair(tensor_total_length / reduce_total_length, reduce_total_length);
};
template <int x, typename Seq>
constexpr bool belong()
{
bool inside = false;
for(int i = NumInvariantDim; i < Rank; i++)
reduce_total_length *= inLengths[i];
static_for<0, Seq::Size(), 1>{}([&](auto i) { inside = (inside || (x == Seq::At(i))); });
for(int i = 0; i < NumInvariantDim; i++)
invariant_total_length *= inLengths[i];
return (inside);
return std::make_pair(invariant_total_length, reduce_total_length);
};
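// Illustrative sketch (assumed values): with Rank = 4, NumReduceDim = 2 and an already
// shuffled inLengths of {8, 16, 32, 64}, the invariant dims contribute 8 * 16 = 128 and the
// reduce dims contribute 32 * 64 = 2048, so get_2d_lengths returns {128, 2048}.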
// helper functions using variadic template arguments
template <index_t... Ns>
static auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
auto make_tuple_from_array_and_index_seq(const std::vector<int>& lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
......@@ -59,16 +51,12 @@ static auto make_tuple_from_array(const std::vector<int>& lengths, Number<arrayS
};
template <index_t Rank, index_t NumReduceDim>
static inline std::pair<std::vector<int>, std::vector<int>>
shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
const std::vector<int>& dimStrides,
const std::vector<int>& reduceDims)
std::vector<int> shuffle_tensor_dimensions(const std::vector<int>& origLengthsStrides,
const std::vector<int>& reduceDims)
{
std::vector<int> newDimLengths;
std::vector<int> newDimStrides;
std::vector<int> newLengthsStrides;
assert(Rank == dimLengths.size() && Rank == dimStrides.size() &&
NumReduceDim == reduceDims.size());
assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());
int reduceFlag = 0;
......@@ -82,19 +70,17 @@ shuffle_tensor_dimensions(const std::vector<int>& dimLengths,
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
newDimLengths.push_back(dimLengths[i]);
newDimStrides.push_back(dimStrides[i]);
newLengthsStrides.push_back(origLengthsStrides[i]);
};
// collect reduce dimensions
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) > 0)
{
newDimLengths.push_back(dimLengths[i]);
newDimStrides.push_back(dimStrides[i]);
newLengthsStrides.push_back(origLengthsStrides[i]);
};
return std::make_pair(newDimLengths, newDimStrides);
return newLengthsStrides;
};
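// Illustrative sketch (assumed values): with Rank = 4, reduceDims = {1, 3} and lengths
// {10, 20, 30, 40}, the invariant dims 0 and 2 are collected first, giving the shuffled
// result {10, 30, 20, 40}; callers apply the same permutation to the strides in a second call.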
} // namespace device
......
......@@ -39,18 +39,18 @@ struct DeviceReduceMultiBlockAtomicAdd
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
(MThreadSliceSize % OutDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
using IndexDataType = int32_t;
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr index_t numSrcDim = Rank;
static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
static constexpr bool support_AtomicAdd =
std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
......@@ -67,18 +67,18 @@ struct DeviceReduceMultiBlockAtomicAdd
int blkGroupSize,
int kBlockTileIterations)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
if constexpr(reduceAllDim)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
......@@ -89,6 +89,9 @@ struct DeviceReduceMultiBlockAtomicAdd
}
else
{
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
......@@ -103,19 +106,20 @@ struct DeviceReduceMultiBlockAtomicAdd
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
make_right_pad_transform(reduceLength, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
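// A minimal standalone sketch of the M/K right-padding arithmetic used above.
// Assumption: integer_least_multiple(x, unit) rounds x up to the next multiple of
// unit; every name below is local to this sketch, not the library's implementation.
namespace pad_sketch {
constexpr int integer_least_multiple(int x, int unit) { return ((x + unit - 1) / unit) * unit; }
// Pad the invariant (M) length up to a multiple of the M block tile size.
constexpr int pad_m(int invariantLength, int mBlockTileSize)
{
    return integer_least_multiple(invariantLength, mBlockTileSize) - invariantLength;
}
// Pad the reduce (K) length up to reduceSizePerBlock * blkGroupSize.
constexpr int pad_k(int reduceLength, int kBlockTileSize, int kBlockTileIterations, int blkGroupSize)
{
    const int reduceSizePerBlock = kBlockTileSize * kBlockTileIterations;
    return reduceSizePerBlock * blkGroupSize - reduceLength;
}
// 1000 -> 1024 for a 64-wide M tile; 4000 -> 4096 for 128 * 2 * 16 K elements.
static_assert(pad_m(1000, 64) == 24, "invariant-dimension right pad");
static_assert(pad_k(4000, 128, 2, 16) == 96, "reduce-dimension right pad");
} // namespace pad_sketch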
......@@ -123,44 +127,45 @@ struct DeviceReduceMultiBlockAtomicAdd
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
const std::vector<int>& outStrides)
{
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto out_grid_desc_m = transform_tensor_descriptor(
outDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto outPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto out_grid_desc_m_padded =
transform_tensor_descriptor(out_grid_desc_m,
make_tuple(make_right_pad_transform(outerLen, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
auto out_grid_desc_m_padded = transform_tensor_descriptor(
out_grid_desc_m,
make_tuple(make_right_pad_transform(invariantLength, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (out_grid_desc_m_padded);
};
struct Argument : public BaseArgument
{
Argument(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
Argument(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op)
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op)
: outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
......@@ -171,21 +176,21 @@ struct DeviceReduceMultiBlockAtomicAdd
(void)out_indices_dev;
(void)workspace_dev;
std::tie(inLengths_, inStrides_) =
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths_);
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
if constexpr(InvariantDims::Size() == 0)
if constexpr(NumInvariantDim == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
invariant_lowest_length = inLengths_[NumInvariantDim - 1];
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
reduce_lowest_length = inLengths_[Rank - 1];
int iterations = 1;
while(true)
......@@ -218,7 +223,7 @@ struct DeviceReduceMultiBlockAtomicAdd
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
AccDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
......@@ -334,18 +339,22 @@ struct DeviceReduceMultiBlockAtomicAdd
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
if constexpr(NumInvariantDim == 0)
{
return (false);
}
else
{
if(pArg->inStrides_[NumInvariantDim - 1] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
};
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
if(pArg->inStrides_[Rank - 1] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
......@@ -371,19 +380,19 @@ struct DeviceReduceMultiBlockAtomicAdd
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
......
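// Illustrative sketch of the dimension shuffle assumed by the new indexing above
// (inLengths_[Rank - 1] as the innermost reduce length, inLengths_[NumInvariantDim - 1]
// as the innermost invariant length): the reduced dimensions are moved to the tail of
// the length/stride arrays. This is a local re-implementation for clarity, not the
// library's shuffle_tensor_dimensions.
#include <algorithm>
#include <vector>

inline std::vector<int> shuffle_dims_sketch(const std::vector<int>& dims,
                                            const std::vector<int>& reduceDims)
{
    std::vector<int> invariant, reduced;
    for(int d = 0; d < static_cast<int>(dims.size()); ++d)
    {
        const bool is_reduced =
            std::find(reduceDims.begin(), reduceDims.end(), d) != reduceDims.end();
        (is_reduced ? reduced : invariant).push_back(dims[d]);
    }
    invariant.insert(invariant.end(), reduced.begin(), reduced.end());
    return invariant; // e.g. {8, 16, 32, 64} with reduceDims {1, 3} -> {8, 32, 16, 64}
}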
......@@ -37,31 +37,35 @@ struct DeviceReduceMultiBlockPartialReduce
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static_assert(OutDstVectorSize == 1, "OutDstVectorSize must be 1 for MultiBlockPartialReduce!");
using IndexDataType = int32_t;
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr index_t numSrcDim = Rank;
static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths) override
static constexpr int MaxBlockGroupSize = 256;
long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
const std::vector<int> reduceDims) override
{
size_t invariant_total_length;
size_t reduce_total_length;
auto inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths);
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
int iterations = 1;
while(true)
......@@ -69,8 +73,7 @@ struct DeviceReduceMultiBlockPartialReduce
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
// we want the blkGroupSize to be no more than 128
if(testBlkGroupSize <= 128)
if(testBlkGroupSize <= MaxBlockGroupSize)
break;
iterations++;
......@@ -79,11 +82,12 @@ struct DeviceReduceMultiBlockPartialReduce
int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
size_t workspace_size = invariant_total_length * blkGroupSize;
long_index_t workspace_size = invariant_total_length * blkGroupSize;
size_t wsSizeInBytes =
!NeedIndices ? workspace_size * sizeof(AccDataType)
: workspace_size * (sizeof(AccDataType) + sizeof(int)) + 64 + sizeof(int);
long_index_t wsSizeInBytes =
!NeedIndices
? workspace_size * sizeof(AccDataType)
: workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int);
return (wsSizeInBytes);
};
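// Standalone sketch of the workspace sizing rule above: one partially reduced
// AccDataType value per (invariant element, block group), plus a parallel int32_t
// index buffer and 64 bytes of alignment slack when indices are needed. Names are
// local to this sketch.
#include <cstddef>
#include <cstdint>

template <typename AccDataType>
std::size_t workspace_bytes_sketch(std::size_t invariant_total_length,
                                   std::size_t blkGroupSize,
                                   bool need_indices)
{
    const std::size_t workspace_size = invariant_total_length * blkGroupSize;
    return !need_indices
               ? workspace_size * sizeof(AccDataType)
               : workspace_size * (sizeof(AccDataType) + sizeof(int32_t)) + 64 + sizeof(int);
}
// e.g. workspace_bytes_sketch<float>(1024, 16, false) == 65536 bytes.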
......@@ -95,18 +99,18 @@ struct DeviceReduceMultiBlockPartialReduce
int blkGroupSize,
int kBlockTileIterations)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
if constexpr(reduceAllDim)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
......@@ -117,6 +121,9 @@ struct DeviceReduceMultiBlockPartialReduce
}
else
{
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
......@@ -131,32 +138,35 @@ struct DeviceReduceMultiBlockPartialReduce
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const int reduceSizePerBlock = K_BlockTileSize * kBlockTileIterations;
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - innerLen;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
make_right_pad_transform(reduceLength, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
static auto MakeWorkspace2dDescriptor(int outerLen, int blkGroupSize)
static auto MakeWorkspace2dDescriptor(int invariantLength, int blkGroupSize)
{
auto ws_desc_m_k = make_naive_tensor_descriptor_packed(make_tuple(outerLen, blkGroupSize));
auto ws_desc_m_k =
make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));
const auto wsPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto wsPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto ws_desc_m_k_padded =
transform_tensor_descriptor(ws_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, wsPad),
make_tuple(make_right_pad_transform(invariantLength, wsPad),
make_pass_through_transform(blkGroupSize)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......@@ -166,19 +176,19 @@ struct DeviceReduceMultiBlockPartialReduce
struct Argument : public BaseArgument
{
Argument(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
Argument(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op)
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op)
: outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
......@@ -188,21 +198,21 @@ struct DeviceReduceMultiBlockPartialReduce
in_elementwise_op_{in_elementwise_op},
acc_elementwise_op_{acc_elementwise_op}
{
std::tie(inLengths_, inStrides_) =
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths_);
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
if constexpr(InvariantDims::Size() == 0)
if constexpr(NumInvariantDim == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
invariant_lowest_length = inLengths_[NumInvariantDim - 1];
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
reduce_lowest_length = inLengths_[Rank - 1];
int iterations = 1;
while(true)
......@@ -210,8 +220,7 @@ struct DeviceReduceMultiBlockPartialReduce
int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
// we want the blkGroupSize to be no more than 128
if(testBlkGroupSize <= 128)
if(testBlkGroupSize <= MaxBlockGroupSize)
break;
iterations++;
......@@ -241,7 +250,7 @@ struct DeviceReduceMultiBlockPartialReduce
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
AccDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
......@@ -337,18 +346,22 @@ struct DeviceReduceMultiBlockPartialReduce
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
if constexpr(NumInvariantDim == 0)
{
return (false);
}
else
{
if(pArg->inStrides_[NumInvariantDim - 1] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
};
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
if(pArg->inStrides_[Rank - 1] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
......@@ -371,19 +384,19 @@ struct DeviceReduceMultiBlockPartialReduce
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) override
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
......
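// Minimal sketch of the block-group sizing loop above: iterations grows until the
// resulting block-group count drops to MaxBlockGroupSize or below. Names are local
// to this sketch; max_block_group_size mirrors MaxBlockGroupSize (256 here).
#include <utility>

inline std::pair<int, int> choose_block_group_sketch(long reduce_total_length,
                                                     int k_block_tile_size,
                                                     int max_block_group_size = 256)
{
    int iterations = 1;
    while(true)
    {
        const long tile = static_cast<long>(k_block_tile_size) * iterations;
        const int testBlkGroupSize =
            static_cast<int>((reduce_total_length + tile - 1) / tile);
        if(testBlkGroupSize <= max_block_group_size)
            break;
        iterations++;
    }
    const long tile = static_cast<long>(k_block_tile_size) * iterations;
    const int blkGroupSize = static_cast<int>((reduce_total_length + tile - 1) / tile);
    return {blkGroupSize, iterations}; // {245, 16} for 1'000'000 elements, 256-wide K tile
}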
......@@ -36,20 +36,20 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
static_assert((BlockSize == MThreadClusterSize) && (KThreadClusterSize == 1),
"Threadwise can only be called with KThreadClusterSize be 1 !");
static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
(InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
(MThreadSliceSize % OutDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
using IndexDataType = int32_t;
static constexpr bool BetaIsZero = NeedIndices;
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
using InvariantDims =
typename conditional<NumInvariantDim == 0,
Sequence<>,
typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
static constexpr index_t srcDims = Rank;
static constexpr index_t dstDims = (InvariantDims::Size() == 0) ? 1 : InvariantDims::Size();
static constexpr bool reduceAllDims = (InvariantDims::Size() == 0);
static constexpr index_t numSrcDim = Rank;
static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
static constexpr bool reduceAllDim = (NumInvariantDim == 0);
static constexpr int M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr int K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
......@@ -57,18 +57,18 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
static auto MakeSrc2dDescriptor(const std::vector<int>& inLengths,
const std::vector<int>& inStrides)
{
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<srcDims>{});
const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});
const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto in_grid_desc_m_k = [&]() {
if constexpr(reduceAllDims)
if constexpr(reduceAllDim)
{
const auto one_dim_inDesc = transform_tensor_descriptor(
inDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
make_tuple(Sequence<0>{}));
return transform_tensor_descriptor(one_dim_inDesc,
......@@ -79,6 +79,9 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
}
else
{
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
const auto reduceDimLengths =
make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
const auto invariantDimLengths =
......@@ -93,18 +96,20 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
}
}();
const auto outerLen = in_grid_desc_m_k.GetLength(Number<0>{});
const auto innerLen = in_grid_desc_m_k.GetLength(Number<1>{});
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const auto inPad_M = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto inPad_K = math::integer_least_multiple(innerLen, K_BlockTileSize) - innerLen;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K =
math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;
auto in_grid_desc_m_k_padded =
transform_tensor_descriptor(in_grid_desc_m_k,
make_tuple(make_right_pad_transform(outerLen, inPad_M),
make_right_pad_transform(innerLen, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, inPad_M),
make_right_pad_transform(reduceLength, inPad_K)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (in_grid_desc_m_k_padded);
};
......@@ -112,44 +117,45 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
static auto MakeDst1dDescriptor(const std::vector<int>& outLengths,
const std::vector<int>& outStrides)
{
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<dstDims>{});
const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});
auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto out_grid_desc_m = transform_tensor_descriptor(
outDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto outerLen = out_grid_desc_m.GetLength(Number<0>{});
const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});
const auto outPad = math::integer_least_multiple(outerLen, M_BlockTileSize) - outerLen;
const auto outPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto out_grid_desc_m_padded =
transform_tensor_descriptor(out_grid_desc_m,
make_tuple(make_right_pad_transform(outerLen, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
auto out_grid_desc_m_padded = transform_tensor_descriptor(
out_grid_desc_m,
make_tuple(make_right_pad_transform(invariantLength, outPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (out_grid_desc_m_padded);
};
struct Argument : public BaseArgument
{
Argument(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
Argument(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const InDataType* in_dev,
OutDataType* out_dev,
IndexDataType* out_indices_dev,
AccDataType* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op)
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation acc_elementwise_op)
: outLengths_{outLengths},
outStrides_{outStrides},
in_dev_{in_dev},
......@@ -161,21 +167,21 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
{
(void)workspace_dev;
std::tie(inLengths_, inStrides_) =
shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, inStrides, reduceDims);
inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);
alpha_ = static_cast<AccDataType>(alpha);
beta_ = static_cast<OutDataType>(beta);
alpha_ = type_convert<AccDataType>(alpha);
beta_ = type_convert<AccDataType>(beta);
std::tie(invariant_total_length, reduce_total_length) =
get_2d_lengths<Rank, ReduceDims>(inLengths_);
get_2d_lengths<Rank, NumReduceDim>(inLengths_);
if constexpr(InvariantDims::Size() == 0)
if constexpr(NumInvariantDim == 0)
invariant_lowest_length = 1;
else
invariant_lowest_length = inLengths_[InvariantDims::At(InvariantDims::Size() - 1)];
invariant_lowest_length = inLengths_[NumInvariantDim - 1];
reduce_lowest_length = inLengths_[ReduceDims::At(ReduceDims::Size() - 1)];
reduce_lowest_length = inLengths_[Rank - 1];
gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize;
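// Sketch of the 1-D grid sizing above: one block per M_BlockTileSize invariant
// elements, rounded up (equivalent to integer_least_multiple(len, tile) / tile).
// Local names only.
inline int grid_size_sketch(long invariant_total_length, int m_block_tile_size)
{
    return static_cast<int>((invariant_total_length + m_block_tile_size - 1) /
                            m_block_tile_size);
}
// e.g. grid_size_sketch(1000, 256) == 4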
......@@ -187,7 +193,7 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
std::vector<int> outStrides_;
AccDataType alpha_;
OutDataType beta_;
AccDataType beta_;
const InDataType* in_dev_;
OutDataType* out_dev_;
......@@ -278,18 +284,22 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
if constexpr(InSrcVectorDim == 0)
{
if constexpr(InvariantDims::Size() == 0)
return (false);
if(pArg->inStrides_[InvariantDims::At(InvariantDims::Size() - 1)] != 1)
if constexpr(NumInvariantDim == 0)
{
return (false);
}
else
{
if(pArg->inStrides_[NumInvariantDim - 1] != 1)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
return (false);
};
}
else
{
if(pArg->inStrides_[ReduceDims::At(ReduceDims::Size() - 1)] != 1)
if(pArg->inStrides_[Rank - 1] != 1)
return (false);
if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
......@@ -310,19 +320,19 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, OutE
};
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const OutElementwiseOperation& acc_elementwise_op) override
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation acc_elementwise_op) override
{
return std::make_unique<Argument>(inLengths,
inStrides,
......
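// Compact sketch of the argument-validity rule applied in all three device reductions
// above: the dimension selected for vectorized reads must be contiguous (stride 1) and
// its innermost length divisible by InSrcVectorSize. ArgSketch and vector_load_ok are
// local to this sketch.
#include <vector>

struct ArgSketch
{
    std::vector<int> inStrides_;
    int invariant_lowest_length;
    int reduce_lowest_length;
};

template <int Rank, int NumInvariantDim, int InSrcVectorDim, int InSrcVectorSize>
bool vector_load_ok(const ArgSketch& arg)
{
    if constexpr(InSrcVectorDim == 0)
    {
        if constexpr(NumInvariantDim == 0)
            return false; // no invariant dimension to vectorize along
        else
            return arg.inStrides_[NumInvariantDim - 1] == 1 &&
                   arg.invariant_lowest_length % InSrcVectorSize == 0;
    }
    else
    {
        return arg.inStrides_[Rank - 1] == 1 &&
               arg.reduce_lowest_length % InSrcVectorSize == 0;
    }
}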