merge changes for upstream/latest update

b097be17 · root · 8a891bbd · a49115b9 · b097be17 · b097be17
Commit b097be17 authored Jun 23, 2022 by root
20 changed files
--- a/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp
@@ -557,11 +557,9 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
            float ave_time = 0;
-            using Add =
+            using Add                  = ck::tensor_operation::element_wise::Add;
-                ck::tensor_operation::binary_element_wise::Add<CDataType, CDataType, CDataType>;
+            using Subtract             = ck::tensor_operation::element_wise::Subtract;
-            using Substract = ck::tensor_operation::binary_element_wise::
+            using GridwiseBinAdd       = GridwiseBinaryElementwise_1D<CDataType,
-                Substract<CDataType, CDataType, CDataType>;
-            using GridwiseBinAdd        = GridwiseBinaryElementwise_1D<CDataType,
                                                                CDataType,
                                                                CDataType,
                                                                CDataType,
@@ -573,19 +571,19 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                                AScalarPerVector,
                                                                BScalarPerVector,
                                                                CScalarPerVector>;
-            using GridwiseBinSubstract  = GridwiseBinaryElementwise_1D<CDataType,
+            using GridwiseBinSubtract  = GridwiseBinaryElementwise_1D<CDataType,
-                                                                      CDataType,
+                                                                     CDataType,
-                                                                      CDataType,
+                                                                     CDataType,
-                                                                      CDataType,
+                                                                     CDataType,
-                                                                      CGridDesc_M,
+                                                                     CGridDesc_M,
-                                                                      CGridDesc_M,
+                                                                     CGridDesc_M,
-                                                                      CGridDesc_M,
+                                                                     CGridDesc_M,
-                                                                      Substract,
+                                                                     Subtract,
-                                                                      MPerThread,
+                                                                     MPerThread,
-                                                                      AScalarPerVector,
+                                                                     AScalarPerVector,
-                                                                      BScalarPerVector,
+                                                                     BScalarPerVector,
-                                                                      CScalarPerVector>;
+                                                                     CScalarPerVector>;
-            const auto add_kernel       = kernel_binary_elementwise_1d<GridwiseBinAdd,
+            const auto add_kernel      = kernel_binary_elementwise_1d<GridwiseBinAdd,
                                                                 CDataType,
                                                                 CDataType,
                                                                 CDataType,
@@ -593,14 +591,14 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                                 CGridDesc_M,
                                                                 CGridDesc_M,
                                                                 Add>;
-            const auto substract_kernel = kernel_binary_elementwise_1d<GridwiseBinSubstract,
+            const auto subtract_kernel = kernel_binary_elementwise_1d<GridwiseBinSubtract,
-                                                                       CDataType,
+                                                                      CDataType,
-                                                                       CDataType,
+                                                                      CDataType,
-                                                                       CDataType,
+                                                                      CDataType,
-                                                                       CGridDesc_M,
+                                                                      CGridDesc_M,
-                                                                       CGridDesc_M,
+                                                                      CGridDesc_M,
-                                                                       CGridDesc_M,
+                                                                      CGridDesc_M,
-                                                                       Substract>;
+                                                                      Subtract>;
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
@@ -653,7 +651,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                // c_real = aux - aux_2
                ave_time += launch_and_time_kernel(stream_config,
-                                                   substract_kernel,
+                                                   subtract_kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -663,7 +661,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                   arg.c_grid_desc_m_,
                                                   arg.c_grid_desc_m_,
                                                   arg.c_grid_desc_m_,
-                                                   Substract{});
+                                                   Subtract{});
                ave_time +=
                    launch_and_time_kernel(stream_config,
@@ -764,7 +762,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                // c_real = aux - aux_2
                ave_time += launch_and_time_kernel(stream_config,
-                                                   substract_kernel,
+                                                   subtract_kernel,
                                                   dim3(grid_size),
                                                   dim3(BlockSize),
                                                   0,
@@ -774,7 +772,7 @@ struct DeviceCGemm_4Gemm_Xdl_CShuffle
                                                   arg.c_grid_desc_m_,
                                                   arg.c_grid_desc_m_,
                                                   arg.c_grid_desc_m_,
-                                                   Substract{});
+                                                   Subtract{});
                ave_time +=
                    launch_and_time_kernel(stream_config,

--- a/include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_convnd_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -11,6 +11,7 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "gridwise_gemm_xdlops_bwd_weight.hpp"
+#include "gridwise_unary_elementwise_1d.hpp"
 namespace ck {
 namespace tensor_operation {
@@ -432,7 +433,7 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
        using namespace ck;
        const index_t Di = input_spatial_lengths[0];
-        const index_t Hi = input_spatial_lengths[2];
+        const index_t Hi = input_spatial_lengths[1];
        const index_t Wi = input_spatial_lengths[2];
        const index_t Do = output_spatial_lengths[0];
@@ -628,6 +629,57 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                                                                  1);
    }
+    // type convert descs
+    template <typename Desc_M0>
+    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+    {
+        const auto m0           = desc_m0.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * 4;
+        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
+        const auto desc_m0_pad =
+            transform_tensor_descriptor(desc_m0,
+                                        make_tuple(make_right_pad_transform(m0, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m0_pad;
+    }
+    template <index_t Dim>
+    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
+                                  const std::vector<index_t>& stride,
+                                  index_t gridSize,
+                                  index_t blockSize)
+    {
+        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+        // nd desc - [s0, s1, s2, ...]
+        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
+        // merge nd to 1d desc - [s0 * s1 * ...]
+        if constexpr(Dim > 1)
+        {
+            const auto desc_m0 = transform_tensor_descriptor(
+                desc,
+                make_tuple(make_merge_transform(tupleOfShape)),
+                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+                make_tuple(Sequence<0>{}));
+            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+        }
+        else
+            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
+    }
+    using TypeConvertFp32ToBf16Functor =
+        ck::tensor_operation::element_wise::UnaryTypeConvert<ck::bhalf_t, float>;
+    using GridDesc_M0      = decltype(MakeDescriptor_M0<1>({1}, {1}, 1, 1));
+    using GridwiseUEltwise = GridwiseUnaryElementwise_1D<AccDataType,
+                                                         InDataType,
+                                                         GridDesc_M0,
+                                                         TypeConvertFp32ToBf16Functor,
+                                                         4>;
    using ABCGridDescs = decltype(GetABCGridDesc<NumDimSpatial>());
    using AGridDesc_K0_M_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
@@ -733,6 +785,55 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
        true,
        true>;
+    using GridwiseGemmAtomicAddFloatBf16Splitk = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype
+        AccDataType,
+        AccDataType,
+        InMemoryDataOperationEnum::AtomicAdd,
+        AGridDesc_K0_M_K1,
+        BGridDesc_K0_N_K1,
+        CGridDesc_M_N,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXdl,
+        NPerXdl,
+        K1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
+        ABlockLdsM1PerBlock,
+        ABlockLdsM0PerBlock,
+        ABlockLdsM1Padding,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
+        BBlockLdsAddExtraN,
+        BBlockLdsN1PerBlock,
+        BBlockLdsN0PerBlock,
+        BBlockLdsN1Padding,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CBlockTransferScalarPerVector_NWaveNPerXdl,
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        true,
+        true>;
    // Argument
    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
        decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}));
@@ -881,76 +982,104 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
            const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
-            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
            float ave_time = 0;
-            const auto Run = [&](const auto& kernel) {
+            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
+            const auto run_conv = [&](const auto& kernel) {
                hipGetErrorString(hipMemset(
                    arg.p_c_grid_,
                    0,
                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
                        sizeof(CDataType)));
-                ave_time =
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_a_grid_,
+                                              arg.p_b_grid_,
+                                              arg.p_c_grid_,
+                                              arg.a_grid_desc_kbatch_k0_m_k1_,
+                                              arg.b_grid_desc_kbatch_k0_n_k1_,
+                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.c_element_op_,
+                                              arg.block_2_ctile_map_);
+            };
+            // run kernel for bf16 with splitk
+            const auto run_bf16_splitk = [&](const auto& kernel) {
+                hipGetErrorString(hipMemset(
+                    arg.p_workspace_,
+                    0,
+                    arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
+                        sizeof(AccDataType)));
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_a_grid_,
+                                              arg.p_b_grid_,
+                                              static_cast<AccDataType*>(arg.p_workspace_),
+                                              arg.a_grid_desc_kbatch_k0_m_k1_,
+                                              arg.b_grid_desc_kbatch_k0_n_k1_,
+                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.c_element_op_,
+                                              arg.block_2_ctile_map_);
+            };
+            // kernel for type conversion
+            std::vector<std::size_t> filter_dims{static_cast<std::size_t>(arg.Conv_K_),
+                                                 static_cast<std::size_t>(arg.Conv_C_)};
+            filter_dims.insert(std::end(filter_dims),
+                               std::begin(arg.filter_spatial_lengths_),
+                               std::end(arg.filter_spatial_lengths_));
+            int tensor_size =
+                std::accumulate(filter_dims.begin(), filter_dims.end(), 1, std::multiplies<int>{});
+            const index_t type_convert_grid_size = GridwiseUEltwise::CalculateGridSize(tensor_size);
+            GridDesc_M0 a_grid_desc_m0_ =
+                MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
+            GridDesc_M0 b_grid_desc_m0_ =
+                MakeDescriptor_M0<1>({tensor_size}, {1}, type_convert_grid_size, 256);
+            if(!GridwiseUEltwise::CheckValidity(a_grid_desc_m0_, b_grid_desc_m0_))
+            {
+                throw std::runtime_error("wrong! GridwiseUnaryElementwise_1D has invalid setting");
+            }
+            // run kernel for type conversion
+            void* p_c_grid_tmp_            = static_cast<void*>(arg.p_c_grid_);
+            InDataType* p_c_grid_tmp_bf16_ = static_cast<InDataType*>(p_c_grid_tmp_);
+            const auto run_type_convert    = [&](const auto& kernel) {
+                float elapsed_time =
                    launch_and_time_kernel(stream_config,
                                           kernel,
-                                           dim3(grid_size),
+                                           dim3(type_convert_grid_size),
-                                           dim3(BlockSize),
+                                           dim3(256),
                                           0,
-                                           arg.p_a_grid_,
+                                           static_cast<AccDataType*>(arg.p_workspace_),
-                                           arg.p_b_grid_,
+                                           p_c_grid_tmp_bf16_,
-                                           arg.p_c_grid_,
+                                           a_grid_desc_m0_,
-                                           arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           b_grid_desc_m0_,
-                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           TypeConvertFp32ToBf16Functor{});
-                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                return elapsed_time;
-                                           arg.a_element_op_,
-                                           arg.b_element_op_,
-                                           arg.c_element_op_,
-                                           arg.block_2_ctile_map_);
            };
            if constexpr(std::is_same<InDataType, ck::bhalf_t>::value)
            {
-                if(has_main_k0_block_loop)
+                auto launch_kernel = [&](auto has_main_k_block_loop) {
-                {
+                    constexpr bool has_main_loop = has_main_k_block_loop.value;
-                    const auto kernel = kernel_gemm_xdlops_bwd_weight<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        OutElementwiseOperation,
-                        InElementwiseOperation,
-                        WeiElementwiseOperation,
-                        remove_reference_t<DeviceOp::Block2CTileMap>,
-                        true>;
-                    Run(kernel);
-                }
-                else
-                {
-                    const auto kernel = kernel_gemm_xdlops_bwd_weight<
-                        GridwiseGemm,
-                        ADataType, // TODO: distiguish A/B datatype
-                        CDataType,
-                        remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
-                        remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
-                        remove_reference_t<DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
-                        OutElementwiseOperation,
-                        InElementwiseOperation,
-                        WeiElementwiseOperation,
-                        remove_reference_t<DeviceOp::Block2CTileMap>,
-                        false>;
-                    Run(kernel);
-                }
-            }
-            else
-            {
-                if(has_main_k0_block_loop)
-                {
                    if(kbatch == 1)
                    {
                        const auto kernel = kernel_gemm_xdlops_bwd_weight<
@@ -965,16 +1094,23 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                            true>;
+                            has_main_loop>;
-                        Run(kernel);
+                        return run_conv(kernel);
                    }
                    else
                    {
-                        const auto kernel = kernel_gemm_xdlops_bwd_weight<
+                        const auto kernel_type_convert =
-                            GridwiseGemmAtomicAdd,
+                            kernel_unary_elementwise_1d<GridwiseUEltwise,
+                                                        AccDataType,
+                                                        InDataType,
+                                                        GridDesc_M0,
+                                                        TypeConvertFp32ToBf16Functor>;
+                        const auto kernel_conv = kernel_gemm_xdlops_bwd_weight<
+                            GridwiseGemmAtomicAddFloatBf16Splitk,
                            ADataType, // TODO: distiguish A/B datatype
-                            CDataType,
+                            AccDataType,
                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
                            remove_reference_t<
@@ -983,13 +1119,28 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                            true>;
+                            has_main_loop>;
-                        Run(kernel);
+                        float elapsed_time = 0;
+                        elapsed_time += run_bf16_splitk(kernel_conv);
+                        elapsed_time += run_type_convert(kernel_type_convert);
+                        return elapsed_time;
                    }
+                };
+                if(has_main_k0_block_loop)
+                {
+                    ave_time = launch_kernel(integral_constant<bool, true>{});
                }
                else
                {
+                    ave_time = launch_kernel(integral_constant<bool, false>{});
+                }
+            }
+            else
+            {
+                auto launch_kernel = [&](auto has_main_k_block_loop) {
+                    constexpr bool has_main_loop = has_main_k_block_loop.value;
                    if(kbatch == 1)
                    {
                        const auto kernel = kernel_gemm_xdlops_bwd_weight<
@@ -1004,9 +1155,9 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                            false>;
+                            has_main_loop>;
-                        Run(kernel);
+                        return run_conv(kernel);
                    }
                    else
                    {
@@ -1022,10 +1173,18 @@ struct DeviceConvndBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
                            InElementwiseOperation,
                            WeiElementwiseOperation,
                            remove_reference_t<DeviceOp::Block2CTileMap>,
-                            false>;
+                            has_main_loop>;
-                        Run(kernel);
+                        return run_conv(kernel);
                    }
+                };
+                if(has_main_k0_block_loop)
+                {
+                    ave_time = launch_kernel(integral_constant<bool, true>{});
+                }
+                else
+                {
+                    ave_time = launch_kernel(integral_constant<bool, false>{});
                }
            }

--- a/include/ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_bias_add_reduce_xdl_cshuffle.hpp
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_gemm_reduce.hpp"
+#include "tensor_layout.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp"
+#include "gemm_specialization.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle
+// version currently has compiler issues with register spill which further causes validation
+// failures.
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename C0DataType,
+          typename C1DataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename ReduceAccDataType,
+          typename DPtrsGlobal,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsReduceOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation,
+          typename DGlobalMemoryDataOperation,
+          GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
+          index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
+struct DeviceGemmBiasAddReduce_Xdl_CShuffle
+    : public DeviceGemmBiasAddReduce<AElementwiseOperation,
+                                     BElementwiseOperation,
+                                     CElementwiseOperation,
+                                     C1ElementwiseOperation,
+                                     DxsInElementwiseOperation,
+                                     DxsReduceAccElementwiseOperation>
+{
+    using DeviceOp = DeviceGemmBiasAddReduce_Xdl_CShuffle;
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(I1, StrideA));
+            }
+        }();
+        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
+        const auto MPad = M - MRaw;
+        const auto KPad = K - KRaw;
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            assert(K % AK1 == 0);
+            const auto AK0 = K / AK1;
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(MRaw, MPad),
+                                                       make_right_pad_transform(KRaw, KPad)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_m_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(M)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            assert(KRaw % AK1 == 0);
+            const auto AK0 = KRaw / AK1;
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_right_pad_transform(MRaw, MPad)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            assert(K % AK1 == 0);
+            const auto AK0 = K / AK1;
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_m_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(MRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            assert(KRaw % AK1 == 0);
+            const auto AK0 = KRaw / AK1;
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(MRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(StrideB, I1));
+            }
+        }();
+        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
+        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
+        const auto NPad = N - NRaw;
+        const auto KPad = K - KRaw;
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            assert(K % BK1 == 0);
+            const auto BK0 = K / BK1;
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(NRaw, NPad),
+                                                       make_right_pad_transform(KRaw, KPad)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_n_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(N)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            assert(KRaw % BK1 == 0);
+            const auto BK0 = KRaw / BK1;
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_right_pad_transform(NRaw, NPad)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            assert(K % BK1 == 0);
+            const auto BK0 = K / BK1;
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_n_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(NRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            // not pad N or K
+            assert(KRaw % BK1 == 0);
+            const auto BK0 = KRaw / BK1;
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(NRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+    }
+    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(StrideC, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(I1, StrideC));
+            }
+        }();
+        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
+        const auto MPad = M - MRaw;
+        const auto NPad = N - NRaw;
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(MRaw, MPad),
+                                                          make_right_pad_transform(NRaw, NPad)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+    }
+    // assume D is packed tensor
+    static auto MakeDGridDescriptor_M(index_t MRaw)
+    {
+        const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw));
+        const auto M    = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+        const auto MPad = M - MRaw;
+        if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                     GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M
+            return transform_tensor_descriptor(d_grid_desc_mraw,
+                                               make_tuple(make_right_pad_transform(MRaw, MPad)),
+                                               make_tuple(Sequence<0>{}),
+                                               make_tuple(Sequence<0>{}));
+        }
+        else
+        {
+            // not pad M
+            return d_grid_desc_mraw;
+        }
+    }
+    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
+    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
+    using CGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
+    using C0GridDesc_M_N      = decltype(MakeCGridDescriptor_M_N(1, 1, 0));
+    using C1GridDesc_M_N      = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
+    using DGridDesc_M         = decltype(MakeDGridDescriptor_M(1));
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1<
+        ADataType, // TODO: distinguish A/B datatype
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        C0DataType,
+        C1DataType,
+        ReduceAccDataType,
+        DPtrsGlobal,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        C1ElementwiseOperation,
+        DxsReduceOperation,
+        DxsInElementwiseOperation,
+        DxsReduceAccElementwiseOperation,
+        InMemoryDataOperationEnum::Set,
+        DGlobalMemoryDataOperation,
+        AGridDesc_AK0_M_AK1,
+        BGridDesc_BK0_N_BK1,
+        CGridDesc_M_N,
+        C0GridDesc_M_N,
+        C1GridDesc_M_N,
+        DGridDesc_M,
+        NumGemmKPrefetchStage,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        CReduceThreadClusterLengths_MPerBlock_NPerBlock,
+        CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+        LoopSched>;
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(const ADataType* p_a_grid,
+                 const BDataType* p_b_grid,
+                 CDataType* p_c_grid,
+                 const C0DataType* p_c0_grid,
+                 const C1DataType* p_c1_grid,
+                 DPtrsGlobal p_ds_grid,
+                 index_t MRaw,
+                 index_t NRaw,
+                 index_t KRaw,
+                 index_t StrideA,
+                 index_t StrideB,
+                 index_t StrideC,
+                 index_t StrideC1,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op,
+                 C1ElementwiseOperation c1_element_op,
+                 DxsInElementwiseOperation dxs_in_element_op,
+                 DxsReduceAccElementwiseOperation dxs_out_element_op)
+            : p_a_grid_{p_a_grid},
+              p_b_grid_{p_b_grid},
+              p_c_grid_{p_c_grid},
+              p_c0_grid_{p_c0_grid},
+              p_c1_grid_{p_c1_grid},
+              p_ds_grid_{p_ds_grid},
+              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
+              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
+              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
+              c0_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, 0)},
+              c1_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC1)},
+              d_grid_desc_m_{DeviceOp::MakeDGridDescriptor_M(MRaw)},
+              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              c0_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              c1_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              d_grid_desc_mblock_mperblock_{},
+              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              c_element_op_{c_element_op},
+              c1_element_op_{c1_element_op},
+              dxs_in_element_op_{dxs_in_element_op},
+              dxs_out_element_op_{dxs_out_element_op}
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
+                                           b_grid_desc_bk0_n_bk1_,
+                                           c_grid_desc_m_n_,
+                                           block_2_ctile_map_))
+            {
+                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        c_grid_desc_m_n_);
+                c0_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        c0_grid_desc_m_n_);
+                c1_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        c1_grid_desc_m_n_);
+                d_grid_desc_mblock_mperblock_ =
+                    GridwiseGemm::MakeDGridDescriptor_MBlock_MPerBlock(d_grid_desc_m_);
+            }
+        }
+        //  private:
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        CDataType* p_c_grid_;
+        const C0DataType* p_c0_grid_;
+        const C1DataType* p_c1_grid_;
+        DPtrsGlobal p_ds_grid_;
+        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
+        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
+        CGridDesc_M_N c_grid_desc_m_n_;
+        C0GridDesc_M_N c0_grid_desc_m_n_;
+        C1GridDesc_M_N c1_grid_desc_m_n_;
+        DGridDesc_M d_grid_desc_m_;
+        typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c_grid_desc_mblock_mperblock_nblock_nperblock_;
+        typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c0_grid_desc_mblock_mperblock_nblock_nperblock_;
+        typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            c1_grid_desc_mblock_mperblock_nblock_nperblock_;
+        typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock d_grid_desc_mblock_mperblock_;
+        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CElementwiseOperation c_element_op_;
+        C1ElementwiseOperation c1_element_op_;
+        DxsInElementwiseOperation dxs_in_element_op_;
+        DxsReduceAccElementwiseOperation dxs_out_element_op_;
+    };
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                            arg.b_grid_desc_bk0_n_bk1_,
+                                            arg.c_grid_desc_m_n_,
+                                            arg.block_2_ctile_map_))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+            const index_t grid_size =
+                arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_);
+            const auto K =
+                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            float elapsed_time = 0.0f;
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            {
+                const auto kernel = kernel_gemm_bias_add_reduce_xdl_cshuffle_v1<
+                    GridwiseGemm,
+                    ADataType, // TODO: distiguish A/B datatype
+                    CDataType,
+                    C0DataType,
+                    C1DataType,
+                    DPtrsGlobal,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CElementwiseOperation,
+                    C1ElementwiseOperation,
+                    DxsInElementwiseOperation,
+                    DxsReduceAccElementwiseOperation,
+                    DeviceOp::AGridDesc_AK0_M_AK1,
+                    DeviceOp::BGridDesc_BK0_N_BK1,
+                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock,
+                    typename GridwiseGemm::DefaultBlock2CTileMap,
+                    true>;
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_c0_grid_,
+                                           arg.p_c1_grid_,
+                                           arg.p_ds_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.c1_element_op_,
+                                           arg.dxs_in_element_op_,
+                                           arg.dxs_out_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
+            }
+            else
+            {
+                const auto kernel = kernel_gemm_bias_add_reduce_xdl_cshuffle_v1<
+                    GridwiseGemm,
+                    ADataType, // TODO: distiguish A/B datatype
+                    CDataType,
+                    C0DataType,
+                    C1DataType,
+                    DPtrsGlobal,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CElementwiseOperation,
+                    C1ElementwiseOperation,
+                    DxsInElementwiseOperation,
+                    DxsReduceAccElementwiseOperation,
+                    DeviceOp::AGridDesc_AK0_M_AK1,
+                    DeviceOp::BGridDesc_BK0_N_BK1,
+                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::DGridDescriptor_MBlock_MPerBlock,
+                    typename GridwiseGemm::DefaultBlock2CTileMap,
+                    false>;
+                elapsed_time =
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           arg.p_a_grid_,
+                                           arg.p_b_grid_,
+                                           arg.p_c_grid_,
+                                           arg.p_c0_grid_,
+                                           arg.p_c1_grid_,
+                                           arg.p_ds_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.c1_element_op_,
+                                           arg.dxs_in_element_op_,
+                                           arg.dxs_out_element_op_,
+                                           arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.d_grid_desc_mblock_mperblock_,
+                                           arg.block_2_ctile_map_);
+            }
+            return elapsed_time;
+        }
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.c_grid_desc_m_n_,
+                                           arg.block_2_ctile_map_);
+    }
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             const C0DataType* p_c0,
+                             const C1DataType* p_c1,
+                             DPtrsGlobal p_dxs,
+                             index_t MRaw,
+                             index_t NRaw,
+                             index_t KRaw,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t StrideC1,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op,
+                             C1ElementwiseOperation c1_element_op,
+                             DxsInElementwiseOperation dxs_in_element_op,
+                             DxsReduceAccElementwiseOperation dxs_out_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_c,
+                        p_c0,
+                        p_c1,
+                        p_dxs,
+                        MRaw,
+                        NRaw,
+                        KRaw,
+                        StrideA,
+                        StrideB,
+                        StrideC,
+                        StrideC1,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op,
+                        c1_element_op,
+                        dxs_in_element_op,
+                        dxs_out_element_op};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        const void* p_c0,
+                        const void* p_c1,
+                        void* p_dxs,
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        index_t StrideA,
+                        index_t StrideB,
+                        index_t StrideC,
+                        index_t StrideC1,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op,
+                        C1ElementwiseOperation c1_element_op,
+                        DxsInElementwiseOperation dxs_in_element_op,
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
+                        index_t /* KBatch */ = 1) override
+    {
+        DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          static_cast<const C0DataType*>(p_c0),
+                                          static_cast<const C1DataType*>(p_c1),
+                                          dxs_tuple,
+                                          MRaw,
+                                          NRaw,
+                                          KRaw,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          StrideC1,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op,
+                                          c1_element_op,
+                                          dxs_in_element_op,
+                                          dxs_out_element_op);
+    }
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceGemmReduce_Xdl_CShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1
+            << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+#pragma once
+#include <array>
+#include "device_base.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// input : A[M, K], B[K, N],
+// input : D0[M, N], D1[M, N], ...
+// output : E[M, N]
+// C = a_op(A) * b_op(B)
+// E = cde_op(C, D0, D1, ...)
+template <ck::index_t NumDTensor,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleD : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+template <ck::index_t NumDTensor,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceGemmMultipleDPtr = std::unique_ptr<DeviceGemmMultipleD<NumDTensor,
+                                                                   AElementwiseOperation,
+                                                                   BElementwiseOperation,
+                                                                   CElementwiseOperation>>;
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+#pragma once
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_gemm_multiple_d.hpp"
+#include "common_header.hpp"
+#include "tensor_layout.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "gemm_specialization.hpp"
+#include "device_prop.hpp"
+namespace ck {
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatDsPointer,
+          typename FloatE,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename AGridDesc_AK0_M_AK1,
+          typename BGridDesc_BK0_N_BK1,
+          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename Block2ETileMap,
+          bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_multiple_d_xdl_cshuffle(const FloatAB* __restrict__ p_a_grid,
+                                            const FloatAB* __restrict__ p_b_grid,
+                                            FloatDsPointer p_ds_grid,
+                                            FloatE* __restrict__ p_e_grid,
+                                            const AElementwiseOperation a_element_op,
+                                            const BElementwiseOperation b_element_op,
+                                            const CDEElementwiseOperation cde_element_op,
+                                            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+                                            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+                                            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                                                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                                                e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                            const Block2ETileMap block_2_etile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                  p_b_grid,
+                                                  p_ds_grid,
+                                                  p_e_grid,
+                                                  p_shared,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  cde_element_op,
+                                                  a_grid_desc_ak0_m_ak1,
+                                                  b_grid_desc_bk0_n_bk1,
+                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  block_2_etile_map);
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_ds_grid;
+    ignore = p_e_grid;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = cde_element_op;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = block_2_etile_map;
+#endif
+}
+} // namespace ck
+namespace ck {
+namespace tensor_operation {
+namespace device {
+// input : A[M, K], or A[K, N]
+// input : B[K, N], or A[N, K]
+// input : D0[M, N], D1[M, N], ...
+// output : E[M, N]
+// C = a_op(A) * b_op(B)
+// E = cde_op(C, D0, D1, ...)
+template <typename ALayout,
+          typename BLayout,
+          typename CDELayout,
+          typename ADataType,
+          typename BDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched = make_default_loop_scheduler()>
+struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<DsDataType::Size(),
+                                                                     AElementwiseOperation,
+                                                                     BElementwiseOperation,
+                                                                     CDEElementwiseOperation>
+{
+    using DeviceOp = DeviceGemmMultipleD_Xdl_CShuffle;
+    static constexpr index_t NumDTensor = DsDataType::Size();
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(I1, StrideA));
+            }
+        }();
+        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
+        const auto MPad = M - MRaw;
+        const auto KPad = K - KRaw;
+        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both M and K
+            assert(K % AK1 == 0);
+            const auto AK0 = K / AK1;
+            const auto a_grid_desc_m_k =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_right_pad_transform(MRaw, MPad),
+                                                       make_right_pad_transform(KRaw, KPad)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_m_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(M)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad M, but not K
+            assert(KRaw % AK1 == 0);
+            const auto AK0 = KRaw / AK1;
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_right_pad_transform(MRaw, MPad)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad K, but not M
+            assert(K % AK1 == 0);
+            const auto AK0 = K / AK1;
+            const auto a_grid_desc_m_k = transform_tensor_descriptor(
+                a_grid_desc_mraw_kraw,
+                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_m_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(MRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+        else
+        {
+            // not pad M or K
+            assert(KRaw % AK1 == 0);
+            const auto AK0 = KRaw / AK1;
+            const auto a_grid_desc_ak0_m_ak1 =
+                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                       make_pass_through_transform(MRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return a_grid_desc_ak0_m_ak1;
+        }
+    }
+    static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(StrideB, I1));
+            }
+        }();
+        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
+        const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock;
+        const auto NPad = N - NRaw;
+        const auto KPad = K - KRaw;
+        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad both N and K
+            assert(K % BK1 == 0);
+            const auto BK0 = K / BK1;
+            const auto b_grid_desc_n_k =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_right_pad_transform(NRaw, NPad),
+                                                       make_right_pad_transform(KRaw, KPad)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_n_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(N)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::MNPadding)
+        {
+            // pad N, but not K
+            assert(KRaw % BK1 == 0);
+            const auto BK0 = KRaw / BK1;
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_right_pad_transform(NRaw, NPad)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad K, but not N
+            assert(K % BK1 == 0);
+            const auto BK0 = K / BK1;
+            const auto b_grid_desc_n_k = transform_tensor_descriptor(
+                b_grid_desc_nraw_kraw,
+                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_n_k,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(NRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+        else
+        {
+            // not pad N or K
+            assert(KRaw % BK1 == 0);
+            const auto BK0 = KRaw / BK1;
+            const auto b_grid_desc_bk0_n_bk1 =
+                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                       make_pass_through_transform(NRaw)),
+                                            make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+            return b_grid_desc_bk0_n_bk1;
+        }
+    }
+    static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
+    {
+        const auto c_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, CDELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(StrideE, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, CDELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(I1, StrideE));
+            }
+        }();
+        const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock;
+        const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock;
+        const auto MPad = M - MRaw;
+        const auto NPad = N - NRaw;
+        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
+                     GemmSpec == GemmSpecialization::MNKPadding)
+        {
+            // pad M and N
+            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                               make_tuple(make_right_pad_transform(MRaw, MPad),
+                                                          make_right_pad_transform(NRaw, NPad)),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
+                          GemmSpec == GemmSpecialization::MKPadding)
+        {
+            // pad M, but not N
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
+                          GemmSpec == GemmSpecialization::NKPadding)
+        {
+            // pad N, but not M
+            return transform_tensor_descriptor(
+                c_grid_desc_mraw_nraw,
+                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            // not pad M or N
+            return c_grid_desc_mraw_nraw;
+        }
+    }
+    using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1));
+    using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1));
+    using EGridDesc_M_N       = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_xdl_cshuffle<
+        ADataType, // TODO: distinguish A/B datatype
+        GemmAccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        InMemoryDataOperationEnum::Set,
+        AGridDesc_AK0_M_AK1,
+        BGridDesc_BK0_N_BK1,
+        EGridDesc_M_N,
+        NumGemmKPrefetchStage,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEBlockTransferScalarPerVector_NPerBlock,
+        LoopSched>;
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_a_grid,
+                 const void* p_b_grid,
+                 std::array<const void*, NumDTensor> p_ds_grid,
+                 void* p_e_grid,
+                 index_t MRaw,
+                 index_t NRaw,
+                 index_t KRaw,
+                 index_t StrideA,
+                 index_t StrideB,
+                 std::array<index_t, NumDTensor> StrideDs,
+                 index_t StrideE,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : p_a_grid_{static_cast<const ADataType*>(p_a_grid)},
+              p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
+              p_ds_grid_{}, // FIXME
+              p_e_grid_{static_cast<EDataType*>(p_e_grid)},
+              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
+              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
+              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              e_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideE)},
+              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+            if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_,
+                                           b_grid_desc_bk0_n_bk1_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+                static_for<0, NumDTensor, 1>{}([&](auto i) {
+                    using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                    p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
+                    const auto d_grid_desc_m_n =
+                        DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideDs[i]);
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_(i) =
+                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                            d_grid_desc_m_n);
+                });
+            }
+        }
+        // ck::Tuple<const DsDataType*...>
+        static constexpr auto MakeDsGridPointer()
+        {
+            return generate_tuple(
+                [&](auto i) {
+                    using DDataType = remove_cv_t<decltype(DsDataType{}.At(i))>;
+                    return static_cast<const DDataType*>(nullptr);
+                },
+                Number<NumDTensor>{});
+        }
+        //  private:
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
+        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
+        StaticallyIndexedArray<
+            typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+            NumDTensor>
+            ds_grid_desc_mblock_mperblock_nblock_nperblock_; // FIXME: Ds desc may be of different
+                                                             // type from E
+        EGridDesc_M_N e_grid_desc_m_n_;
+        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+            e_grid_desc_mblock_mperblock_nblock_nperblock_;
+        typename GridwiseGemm::DefaultBlock2ETileMap block_2_etile_map_;
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                            arg.b_grid_desc_bk0_n_bk1_,
+                                            arg.e_grid_desc_m_n_,
+                                            arg.block_2_etile_map_))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+            const index_t grid_size =
+                arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
+            const auto K =
+                arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
+            auto launch_kernel = [&](auto has_main_k_block_loop) {
+                constexpr bool has_main_loop = has_main_k_block_loop.value;
+                const auto kernel = kernel_gemm_multiple_d_xdl_cshuffle<
+                    GridwiseGemm,
+                    ADataType, // TODO: distiguish A/B datatype
+                    typename GridwiseGemm::DsGridPointer,
+                    EDataType,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CDEElementwiseOperation,
+                    DeviceOp::AGridDesc_AK0_M_AK1,
+                    DeviceOp::BGridDesc_BK0_N_BK1,
+                    ck::StaticallyIndexedArray<
+                        typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                        NumDTensor>,
+                    typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+                    typename GridwiseGemm::DefaultBlock2ETileMap,
+                    has_main_loop>;
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_a_grid_,
+                                              arg.p_b_grid_,
+                                              arg.p_ds_grid_,
+                                              arg.p_e_grid_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.cde_element_op_,
+                                              arg.a_grid_desc_ak0_m_ak1_,
+                                              arg.b_grid_desc_bk0_n_bk1_,
+                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.block_2_etile_map_);
+            };
+            float ave_time = 0;
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            {
+                ave_time = launch_kernel(integral_constant<bool, true>{});
+            }
+            else
+            {
+                ave_time = launch_kernel(integral_constant<bool, false>{});
+            }
+            return ave_time;
+        }
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        {
+            return false;
+        }
+        return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_,
+                                           arg.b_grid_desc_bk0_n_bk1_,
+                                           arg.e_grid_desc_m_n_,
+                                           arg.block_2_etile_map_);
+    }
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+    static auto MakeArgument(const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t MRaw,
+                             index_t NRaw,
+                             index_t KRaw,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_ds,
+                        p_e,
+                        MRaw,
+                        NRaw,
+                        KRaw,
+                        StrideA,
+                        StrideB,
+                        StrideDs,
+                        StrideE,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+    static auto MakeInvoker() { return Invoker{}; }
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        index_t StrideA,
+                        index_t StrideB,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_a,
+                                          p_b,
+                                          p_ds,
+                                          p_e,
+                                          MRaw,
+                                          NRaw,
+                                          KRaw,
+                                          StrideA,
+                                          StrideB,
+                                          StrideDs,
+                                          StrideE,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceGemmMultipleD_Xdl_CShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1
+            << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp
@@ -6,19 +6,18 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename DPtrsGlobal,
+template <typename AElementwiseOperation,
-          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          typename DxsInElementwiseOperation,
-          typename DxsAccElementwiseOperation>
+          typename DxsReduceAccElementwiseOperation>
 struct DeviceGemmReduce : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
                        void* p_c,
-                        DPtrsGlobal p_dxs,
+                        void* p_dxs,
                        ck::index_t M,
                        ck::index_t N,
                        ck::index_t K,
@@ -29,24 +28,69 @@ struct DeviceGemmReduce : public BaseOperator
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op,
                        DxsInElementwiseOperation dxs_in_element_op,
-                        DxsAccElementwiseOperation dxs_out_element_op,
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
                        ck::index_t BatchCount = 1) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
-template <typename DPtrsGlobal,
+template <typename AElementwiseOperation,
-          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          typename DxsInElementwiseOperation,
-          typename DxsAccElementwiseOperation>
+          typename DxsReduceAccElementwiseOperation>
-using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
+using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
-                                                             AElementwiseOperation,
                                                             BElementwiseOperation,
                                                             CElementwiseOperation,
                                                             DxsInElementwiseOperation,
-                                                             DxsAccElementwiseOperation>>;
+                                                             DxsReduceAccElementwiseOperation>>;
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation>
+struct DeviceGemmBiasAddReduce : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        const void* p_c0,
+                        const void* p_c1,
+                        void* p_dxs,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        ck::index_t StrideC,
+                        ck::index_t StrideC1,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op,
+                        C1ElementwiseOperation c1_element_op,
+                        DxsInElementwiseOperation dxs_in_element_op,
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
+                        ck::index_t BatchCount = 1) = 0;
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+template <typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation>
+using DeviceGemmBiasAddReducePtr =
+    std::unique_ptr<DeviceGemmBiasAddReduce<AElementwiseOperation,
+                                            BElementwiseOperation,
+                                            CElementwiseOperation,
+                                            C1ElementwiseOperation,
+                                            DxsInElementwiseOperation,
+                                            DxsReduceAccElementwiseOperation>>;
 } // namespace device
 } // namespace tensor_operation

--- a/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce_xdl_cshuffle.hpp
@@ -32,7 +32,7 @@ template <typename ALayout,
          typename CElementwiseOperation,
          typename DxsReduceOperation,
          typename DxsInElementwiseOperation,
-          typename DxsAccElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation,
          typename DGlobalMemoryDataOperation,
          GemmSpecialization GemmSpec,
          index_t NumGemmKPrefetchStage,
@@ -68,12 +68,11 @@ template <typename ALayout,
          index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
          LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
+struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
-                                                               AElementwiseOperation,
                                                               BElementwiseOperation,
                                                               CElementwiseOperation,
                                                               DxsInElementwiseOperation,
-                                                               DxsAccElementwiseOperation>
+                                                               DxsReduceAccElementwiseOperation>
 {
    using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
@@ -389,7 +388,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
        CElementwiseOperation,
        DxsReduceOperation,
        DxsInElementwiseOperation,
-        DxsAccElementwiseOperation,
+        DxsReduceAccElementwiseOperation,
        InMemoryDataOperationEnum::Set,
        DGlobalMemoryDataOperation,
        AGridDesc_AK0_M_AK1,
@@ -449,7 +448,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op,
                 DxsInElementwiseOperation dxs_in_element_op,
-                 DxsAccElementwiseOperation dxs_out_element_op)
+                 DxsReduceAccElementwiseOperation dxs_out_element_op)
            : p_a_grid_{p_a_grid},
              p_b_grid_{p_b_grid},
              p_c_grid_{p_c_grid},
@@ -498,7 +497,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
        DxsInElementwiseOperation dxs_in_element_op_;
-        DxsAccElementwiseOperation dxs_out_element_op_;
+        DxsReduceAccElementwiseOperation dxs_out_element_op_;
    };
    // Invoker
@@ -554,7 +553,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                    BElementwiseOperation,
                    CElementwiseOperation,
                    DxsInElementwiseOperation,
-                    DxsAccElementwiseOperation,
+                    DxsReduceAccElementwiseOperation,
                    DeviceOp::AGridDesc_AK0_M_AK1,
                    DeviceOp::BGridDesc_BK0_N_BK1,
                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -594,7 +593,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                    BElementwiseOperation,
                    CElementwiseOperation,
                    DxsInElementwiseOperation,
-                    DxsAccElementwiseOperation,
+                    DxsReduceAccElementwiseOperation,
                    DeviceOp::AGridDesc_AK0_M_AK1,
                    DeviceOp::BGridDesc_BK0_N_BK1,
                    typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -669,7 +668,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op,
                             DxsInElementwiseOperation dxs_in_element_op,
-                             DxsAccElementwiseOperation dxs_out_element_op)
+                             DxsReduceAccElementwiseOperation dxs_out_element_op)
    {
        return Argument{p_a,
                        p_b,
@@ -691,27 +690,29 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
    static auto MakeInvoker() { return Invoker{}; }
    // polymorphic
-    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+    std::unique_ptr<BaseArgument>
-                                                      const void* p_b,
+    MakeArgumentPointer(const void* p_a,
-                                                      void* p_c,
+                        const void* p_b,
-                                                      DPtrsGlobal p_dxs,
+                        void* p_c,
-                                                      index_t MRaw,
+                        void* p_dxs,
-                                                      index_t NRaw,
+                        index_t MRaw,
-                                                      index_t KRaw,
+                        index_t NRaw,
-                                                      index_t StrideA,
+                        index_t KRaw,
-                                                      index_t StrideB,
+                        index_t StrideA,
-                                                      index_t StrideC,
+                        index_t StrideB,
-                                                      AElementwiseOperation a_element_op,
+                        index_t StrideC,
-                                                      BElementwiseOperation b_element_op,
+                        AElementwiseOperation a_element_op,
-                                                      CElementwiseOperation c_element_op,
+                        BElementwiseOperation b_element_op,
-                                                      DxsInElementwiseOperation dxs_in_element_op,
+                        CElementwiseOperation c_element_op,
-                                                      DxsAccElementwiseOperation dxs_out_element_op,
+                        DxsInElementwiseOperation dxs_in_element_op,
-                                                      index_t /* KBatch */ = 1) override
+                        DxsReduceAccElementwiseOperation dxs_out_element_op,
+                        index_t /* KBatch */ = 1) override
    {
+        DPtrsGlobal dxs_tuple = *(static_cast<DPtrsGlobal*>(p_dxs));
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
-                                          p_dxs,
+                                          dxs_tuple,
                                          MRaw,
                                          NRaw,
                                          KRaw,

--- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
@@ -362,7 +362,7 @@ struct DeviceGroupedGemmXdl
        {
            grid_size_ = 0;
-            gemm_descs_args_workspace_ = nullptr;
+            p_workspace_ = nullptr;
            group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());
@@ -437,8 +437,6 @@ struct DeviceGroupedGemmXdl
        std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;
-        void* gemm_descs_args_workspace_;
        index_t grid_size_;
    };
@@ -488,7 +486,7 @@ struct DeviceGroupedGemmXdl
            }
            hipGetErrorString(
-                hipMemcpy(arg.gemm_descs_args_workspace_,
+                hipMemcpy(arg.p_workspace_,
                          arg.gemm_desc_kernel_arg_.data(),
                          arg.gemm_desc_kernel_arg_.size() * sizeof(GemmDescKernelArg),
                          hipMemcpyHostToDevice));
@@ -507,17 +505,17 @@ struct DeviceGroupedGemmXdl
                                                    CElementwiseOperation,
                                                    true>;
-                ave_time = launch_and_time_kernel(
+                ave_time =
-                    stream_config,
+                    launch_and_time_kernel(stream_config,
-                    kernel,
+                                           kernel,
-                    dim3(arg.grid_size_),
+                                           dim3(arg.grid_size_),
-                    dim3(BlockSize),
+                                           dim3(BlockSize),
-                    0,
+                                           0,
-                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                                           cast_pointer_to_constant_address_space(arg.p_workspace_),
-                    arg.gemm_desc_kernel_arg_.size(),
+                                           arg.gemm_desc_kernel_arg_.size(),
-                    arg.a_element_op_,
+                                           arg.a_element_op_,
-                    arg.b_element_op_,
+                                           arg.b_element_op_,
-                    arg.c_element_op_);
+                                           arg.c_element_op_);
            }
            else
            {
@@ -531,17 +529,17 @@ struct DeviceGroupedGemmXdl
                                                    CElementwiseOperation,
                                                    false>;
-                ave_time = launch_and_time_kernel(
+                ave_time =
-                    stream_config,
+                    launch_and_time_kernel(stream_config,
-                    kernel,
+                                           kernel,
-                    dim3(arg.grid_size_),
+                                           dim3(arg.grid_size_),
-                    dim3(BlockSize),
+                                           dim3(BlockSize),
-                    0,
+                                           0,
-                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                                           cast_pointer_to_constant_address_space(arg.p_workspace_),
-                    arg.gemm_desc_kernel_arg_.size(),
+                                           arg.gemm_desc_kernel_arg_.size(),
-                    arg.a_element_op_,
+                                           arg.a_element_op_,
-                    arg.b_element_op_,
+                                           arg.b_element_op_,
-                    arg.c_element_op_);
+                                           arg.c_element_op_);
            }
            return ave_time;
@@ -635,11 +633,6 @@ struct DeviceGroupedGemmXdl
    {
        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmDescKernelArg);
    }
-    void SetWorkSpacePointer(BaseArgument* p_arg, void* workspace_ptr) const override
-    {
-        dynamic_cast<Argument*>(p_arg)->gemm_descs_args_workspace_ = workspace_ptr;
-    }
 };
 } // namespace device

--- a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
@@ -35,14 +35,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
    using IndexDataType = int32_t;
-    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+    using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
    using InElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
+        typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
    using AccElementwiseOperation =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
+        typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;
-            AccElementwiseOperation;
    static constexpr index_t InSrcOutDstVectorDim =
        0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is
@@ -178,13 +177,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
            invariant_lowest_length_ = C;
            reduce_lowest_length_    = window_spatial_lengths[1];
-            // TODO: is this correct?
+            int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];
-            if constexpr(ReduceOpId == ck::ReduceTensorOp::AVG)
-            {
+            std::tie(in_element_op_, acc_element_op_) =
-                ck::index_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
+                reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);
-                in_element_op_      = InElementwiseOperation{divider};
-                acc_element_op_     = AccElementwiseOperation{divider};
-            }
        }
        const InDataType* p_in_dev_;

--- a/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
@@ -61,12 +61,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
    static constexpr bool use_multiblock =
        (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);
-    static constexpr bool out_type_compatible_with_atomic_op =
+    static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
-        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
+                                                                      OutDataType>::value,
+                  "The OutDataType must support the specified OutMemoryDataOperation!");
-    static_assert(
-        !use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
-        "The OutDataType must support the atomic operation for using MultiBlock reduction");
    static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                  "MultiBlock reduction can only be used when outputing index is not required");
@@ -349,7 +346,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
            if constexpr(use_multiblock)
            {
                const auto identityVal =
-                    ck::reduce::GetIdentityValueueForInMemoryDataOperation<OutDataType>(
+                    ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
                        OutMemoryDataOperation);
                const auto kernel_pre =
@@ -393,10 +390,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        };
    };
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    static bool IsSupportedArgument(const Argument* pArg)
    {
-        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
        if constexpr(use_multiblock)
        {
            if(static_cast<float>(pArg->beta_) != 0.0f)
@@ -445,11 +440,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        else
        {
            // cases with very small reduce_total_length should be handled by ThreadWise kernel
-            if(pArg->reduce_total_length / KThreadSliceSize < 2)
+            // if(pArg->reduce_total_length / KThreadSliceSize < 2)
-                return (false);
+            //     return (false);
        };
        return (true);
+    }
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
    };
    std::unique_ptr<BaseArgument>
@@ -492,7 +492,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
        auto str = std::stringstream();
        // clang-format off
-        str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
+        str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";

--- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp
+#ifndef DEVICE_SOFTMAX_HPP
+#define DEVICE_SOFTMAX_HPP
+#include <iostream>
+#include <sstream>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "device_reduce.hpp"
+#include "device_reduce_multiblock.hpp"
+#include "device_reduce_common.hpp"
+#include "gridwise_softmax.hpp"
+#include "gridwise_set_buffer_value.hpp"
+#include "reduction_operator.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename InDataType,
+          typename AccDataType,
+          typename OutDataType,
+          index_t Rank,
+          index_t NumReduceDim,
+          index_t BlockSize,
+          index_t MThreadClusterSize,
+          index_t KThreadClusterSize,
+          index_t MThreadSliceSize,
+          index_t KThreadSliceSize,
+          index_t InSrcVectorDim,
+          index_t InSrcVectorSize,
+          index_t OutDstVectorSize>
+struct DeviceSoftmax : public BaseOperator
+{
+    using PassThrough = tensor_operation::element_wise::PassThrough;
+    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
+    using Reduction = DeviceReduceMultiBlock<InDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             Rank,
+                                             NumReduceDim,
+                                             reduce::Add,
+                                             PassThrough, // InElementwiseOperation
+                                             PassThrough, // AccElementwiseOperation
+                                             InMemoryDataOperationEnum::Set,
+                                             false, // PropagateNan
+                                             false, // OutputIndex
+                                             false, // HaveIndexInputIfOutputIndex
+                                             BlockSize,
+                                             MThreadClusterSize,
+                                             KThreadClusterSize,
+                                             MThreadSliceSize,
+                                             KThreadSliceSize,
+                                             InSrcVectorDim,
+                                             InSrcVectorSize,
+                                             1>; // OutDstVectorSize
+    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
+    using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
+                                                    OutDataType,
+                                                    AccDataType,
+                                                    GridDesc_M_K,
+                                                    BlockSize,
+                                                    MThreadClusterSize,
+                                                    KThreadClusterSize,
+                                                    MThreadSliceSize,
+                                                    KThreadSliceSize,
+                                                    InSrcVectorDim,
+                                                    InSrcVectorSize,
+                                                    OutDstVectorSize>;
+    struct Argument : public Reduction::Argument
+    {
+        Argument(const std::vector<index_t> inLengths,
+                 const std::vector<index_t> inStrides,
+                 const std::vector<index_t> reduceDims,
+                 AccDataType alpha,
+                 AccDataType beta,
+                 const InDataType* in_dev,
+                 OutDataType* out_dev)
+            : Reduction::Argument(inLengths,
+                                  inStrides,
+                                  {},
+                                  {},
+                                  reduceDims,
+                                  0.0f, // alpha
+                                  0.0f, // beta
+                                  in_dev,
+                                  nullptr,
+                                  out_dev,
+                                  nullptr,
+                                  PassThrough{},
+                                  PassThrough{}),
+              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
+              // float32 precision. Make it support any data type so the fields can be removed.
+              alpha_(alpha),
+              beta_(beta)
+        {
+            // std::cout << "blkGroupSize= " << this->blkGroupSize
+            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
+            //           << ", gridSize=" << this->gridSize
+            //           << ", invariant_total_length=" << this->invariant_total_length <<
+            //           std::endl;
+        }
+        AccDataType alpha_;
+        AccDataType beta_;
+    };
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
+                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
+            const auto kernel_main =
+                kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
+            float avg_time = 0;
+            avg_time += launch_and_time_kernel(stream_config,
+                                               kernel_main,
+                                               dim3(arg.gridSize),
+                                               dim3(BlockSize),
+                                               0,
+                                               in_grid_desc_m_k,
+                                               out_grid_desc_m_k,
+                                               arg.blkGroupSize,
+                                               arg.numBlockTileIteration,
+                                               arg.alpha_,
+                                               arg.in_dev_,
+                                               arg.beta_,
+                                               arg.out_dev_);
+            return (avg_time);
+        };
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        };
+    };
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);
+        if(!Reduction::IsSupportedArgument(p_arg_))
+        {
+            return false;
+        }
+        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
+        {
+            return false;
+        }
+        return true;
+    };
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
+                                                      const std::vector<index_t> inStrides,
+                                                      const std::vector<int> reduceDims,
+                                                      AccDataType alpha,
+                                                      AccDataType beta,
+                                                      const void* in_dev,
+                                                      void* out_dev)
+    {
+        return std::make_unique<Argument>(inLengths,
+                                          inStrides,
+                                          reduceDims,
+                                          alpha,
+                                          beta,
+                                          static_cast<const InDataType*>(in_dev),
+                                          static_cast<OutDataType*>(out_dev));
+    };
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceReduceSoftmax<" << BlockSize << ",";
+        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
+        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
+        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif // DEVICE_SOFTMAX_HPP
--- a/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
+#pragma once
+#include <iostream>
+#include <vector>
+#include "device.hpp"
+#include "device_base.hpp"
+#include "gridwise_unary_elementwise_1d.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+template <typename ADataType,
+          typename BDataType,
+          typename ElementwiseFunctor,
+          index_t Dim,
+          index_t ScalarPerVector>
+struct DeviceUnaryElementwise : public BaseOperator
+{
+    static constexpr auto I0 = Number<0>{};
+    template <typename Desc_M0>
+    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
+    {
+        const auto m0           = desc_m0.GetLength(I0);
+        const index_t loop_step = gridSize * blockSize * ScalarPerVector;
+        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
+        const auto desc_m0_pad =
+            transform_tensor_descriptor(desc_m0,
+                                        make_tuple(make_right_pad_transform(m0, pad)),
+                                        make_tuple(Sequence<0>{}),
+                                        make_tuple(Sequence<0>{}));
+        return desc_m0_pad;
+    }
+    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
+                                  const std::vector<index_t>& stride,
+                                  index_t gridSize,
+                                  index_t blockSize)
+    {
+        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
+        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
+        // nd desc - [s0, s1, s2, ...]
+        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
+        // merge nd to 1d desc - [s0 * s1 * ...]
+        if constexpr(Dim > 1)
+        {
+            const auto desc_m0 = transform_tensor_descriptor(
+                desc,
+                make_tuple(make_merge_transform(tupleOfShape)),
+                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
+                make_tuple(Sequence<0>{}));
+            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
+        }
+        else
+            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
+    }
+    using GridDesc_M0      = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
+    using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType,
+                                                         BDataType,
+                                                         GridDesc_M0,
+                                                         ElementwiseFunctor,
+                                                         ScalarPerVector>;
+    struct Argument : public BaseArgument
+    {
+        Argument(const ADataType* p_a,
+                 BDataType* p_b,
+                 const std::vector<index_t>& shape,
+                 const std::vector<index_t>& stride_a,
+                 const std::vector<index_t>& stride_b,
+                 ElementwiseFunctor functor)
+            : p_a_(p_a),
+              p_b_(p_b),
+              shape_(shape),
+              functor_(functor),
+              blockSize_(256) // FIXME - Calculate the grid size by number of CU in the future
+        {
+            index_t tensor_size =
+                std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
+            gridSize_       = GridwiseUEltwise::CalculateGridSize(tensor_size);
+            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
+            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
+        }
+        const ADataType* p_a_;
+        BDataType* p_b_;
+        std::vector<int> shape_;
+        GridDesc_M0 a_grid_desc_m0_;
+        GridDesc_M0 b_grid_desc_m0_;
+        ElementwiseFunctor functor_;
+        index_t blockSize_;
+        index_t gridSize_;
+    };
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise,
+                                                            ADataType,
+                                                            BDataType,
+                                                            GridDesc_M0,
+                                                            ElementwiseFunctor>;
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(arg.gridSize_),
+                                                        dim3(arg.blockSize_),
+                                                        0,
+                                                        arg.p_a_,
+                                                        arg.p_b_,
+                                                        arg.a_grid_desc_m0_,
+                                                        arg.b_grid_desc_m0_,
+                                                        arg.functor_);
+            return elapsed_time;
+        }
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
+        if(pArg == nullptr)
+            return false;
+        if(pArg->shape_.back() % ScalarPerVector != 0)
+            return false;
+        return true;
+    };
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      void* p_b,
+                                                      std::vector<index_t> shape,
+                                                      std::vector<index_t> stride_a,
+                                                      std::vector<index_t> stride_b,
+                                                      ElementwiseFunctor functor)
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<BDataType*>(p_b),
+                                          shape,
+                                          stride_a,
+                                          stride_b,
+                                          functor);
+    }
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+        // clang-format off
+        str << "DeviceBinaryElementwise"
+            << "<"
+            << "ScalarPerVector = " << ScalarPerVector
+            << ">";
+        // clang-format on
+        return str.str();
+    }
+};
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
@@ -29,6 +29,7 @@
 #include "reduction_operator.hpp"
 #include "reduction_enums.hpp"
 #include "element_wise_operation.hpp"
+#include <tuple>
 namespace ck {
@@ -37,77 +38,69 @@ namespace ck {
 // The boolean member "indexable" are also provided in reduce_binary_operactor for
 // easier checking by the upper-layer codes in the kernels.
-template <typename T, ReduceTensorOp Op>
+template <ReduceTensorOp Op>
 struct reduce_binary_operator;
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::ADD>
+struct reduce_binary_operator<ReduceTensorOp::ADD>
 {
-    using opType   = reduce::Add<T>;
+    using opType = reduce::Add;
-    using dataType = T;
    static constexpr bool indexable = false;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::MUL>
+struct reduce_binary_operator<ReduceTensorOp::MUL>
 {
-    using opType   = reduce::Mul<T>;
+    using opType = reduce::Mul;
-    using dataType = T;
    static constexpr bool indexable = false;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::MIN>
+struct reduce_binary_operator<ReduceTensorOp::MIN>
 {
-    using opType   = reduce::Min<T>;
+    using opType = reduce::Min;
-    using dataType = T;
    static constexpr bool indexable = true;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::MAX>
+struct reduce_binary_operator<ReduceTensorOp::MAX>
 {
-    using opType   = reduce::Max<T>;
+    using opType = reduce::Max;
-    using dataType = T;
    static constexpr bool indexable = true;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::AMAX>
+struct reduce_binary_operator<ReduceTensorOp::AMAX>
 {
-    using opType   = reduce::AMax<T>;
+    using opType = reduce::AMax;
-    using dataType = T;
    static constexpr bool indexable = true;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::AVG>
+struct reduce_binary_operator<ReduceTensorOp::AVG>
 {
-    using opType   = reduce::Add<T>;
+    using opType = reduce::Add;
-    using dataType = T;
    static constexpr bool indexable = false;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::NORM1>
+struct reduce_binary_operator<ReduceTensorOp::NORM1>
 {
-    using opType   = reduce::Add<T>;
+    using opType = reduce::Add;
-    using dataType = T;
    static constexpr bool indexable = false;
 };
-template <typename T>
+template <>
-struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
+struct reduce_binary_operator<ReduceTensorOp::NORM2>
 {
-    using opType   = reduce::Add<T>;
+    using opType = reduce::Add;
-    using dataType = T;
    static constexpr bool indexable = false;
 };
@@ -115,53 +108,101 @@ struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
 // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
 // functor classes.
 // The two unary functors are called before and afer the Reduction is executed respectively
-template <typename T, ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
+template <ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
 struct reduce_unary_operator
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T, bool IsFirstReduce>
+template <bool IsFirstReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::AVG, IsFirstReduce, true>
+struct reduce_unary_operator<ReduceTensorOp::AVG, IsFirstReduce, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength});
+    };
 };
-template <typename T, bool IsLastReduce>
+template <bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM1, true, IsLastReduce>
+struct reduce_unary_operator<ReduceTensorOp::NORM1, true, IsLastReduce>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T, bool IsLastReduce>
+template <bool IsLastReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::AMAX, true, IsLastReduce>
+struct reduce_unary_operator<ReduceTensorOp::AMAX, true, IsLastReduce>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
+template <>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, false>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, true, false>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
+template <>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, true, true>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, true, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
-template <typename T>
+template <>
-struct reduce_unary_operator<T, ReduceTensorOp::NORM2, false, true>
+struct reduce_unary_operator<ReduceTensorOp::NORM2, false, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt<T, T>;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };
 } // end of namespace ck

--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -24,104 +24,192 @@
 *
 *******************************************************************************/
 #pragma once
 #include "data_type.hpp"
 namespace ck {
 namespace tensor_operation {
-namespace binary_element_wise {
+namespace element_wise {
-template <typename Y, typename X1, typename X2>
-struct Add;
-template <>
+struct Add
-struct Add<double, double, double>
 {
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
    __host__ __device__ constexpr void
-    operator()(double& dst, const double& src1, const double& src2) const
+    operator()<float>(float& y, const float& x0, const float& x1) const
    {
-        dst = src1 + src2;
+        y = x0 + x1;
-    }
+    };
-};
-template <>
+    template <>
-struct Add<float, float, float>
-{
    __host__ __device__ constexpr void
-    operator()(float& dst, const float& src1, const float& src2) const
+    operator()<double>(double& y, const double& x0, const double& x1) const
    {
-        dst = src1 + src2;
+        y = x0 + x1;
-    }
+    };
-};
-template <>
+    // Question: should half_t be supported ?
-struct Add<half_t, half_t, half_t>
+    template <>
-{
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = x0 + x1;
+    };
+    // Question: should bhalf_t be supported ?
+    template <>
    __host__ __device__ constexpr void
-    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
    {
-        dst = src1 + src2;
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp + x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
    }
 };
-template <>
+struct Subtract
-struct Add<bhalf_t, bhalf_t, bhalf_t>
 {
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
    __host__ __device__ constexpr void
-    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    operator()<float>(float& y, const float& x0, const float& x1) const
    {
-        const float x1 = ck::type_convert<float>(src1);
+        y = x0 - x1;
-        const float x2 = ck::type_convert<float>(src2);
+    };
-        const float y  = x1 + x2;
-        dst            = ck::type_convert<bhalf_t>(y);
-    }
-};
-template <typename Y, typename X1, typename X2>
+    template <>
-struct Substract;
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = x0 - x1;
+    };
-template <>
+    // Question: should half_t be supported ?
-struct Substract<double, double, double>
+    template <>
-{
    __host__ __device__ constexpr void
-    operator()(double& dst, const double& src1, const double& src2) const
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
    {
-        dst = src1 - src2;
+        y = x0 - x1;
+    };
+    // Question: should bhalf_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp - x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
    }
 };
-template <>
+struct AlphaBetaAdd
-struct Substract<float, float, float>
 {
+    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
    __host__ __device__ constexpr void
-    operator()(float& dst, const float& src1, const float& src2) const
+    operator()<float>(float& y, const float& x0, const float& x1) const
    {
-        dst = src1 - src2;
+        y = alpha_ * x0 + beta_ * x1;
-    }
+    };
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        y = static_cast<double>(alpha_) * x0 + static_cast<double>(beta_) * x1;
+    };
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        y = static_cast<half_t>(alpha_ * static_cast<float>(x0) + beta_ * static_cast<float>(x1));
+    };
+    float alpha_;
+    float beta_;
 };
-template <>
+struct AddRelu
-struct Substract<half_t, half_t, half_t>
 {
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
    __host__ __device__ constexpr void
-    operator()(half_t& dst, const half_t& src1, const half_t& src2) const
+    operator()<float>(float& y, const float& x0, const float& x1) const
    {
-        dst = src1 - src2;
+        const float a = x0 + x1;
-    }
+        y             = a > 0.0f ? a : 0.0f;
+    };
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        const double a = x0 + x1;
+        y              = a > 0.0 ? a : 0.0;
+    };
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        const half_t a = x0 + x1;
+        y              = a > static_cast<half_t>(0.0f) ? a : static_cast<half_t>(0.0f);
+    };
 };
-template <>
+struct AddHardswish
-struct Substract<bhalf_t, bhalf_t, bhalf_t>
 {
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+    template <>
    __host__ __device__ constexpr void
-    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    operator()<float>(float& y, const float& x0, const float& x1) const
    {
-        const float x1 = ck::type_convert<float>(src1);
+        float a = x0 + x1;
-        const float x2 = ck::type_convert<float>(src2);
+        float b = a + float{3};
-        const float y  = x1 - x2;
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
-        dst            = ck::type_convert<bhalf_t>(y);
+        y       = c;
-    }
+    };
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        double a = x0 + x1;
+        double b = a + 3.0;
+        double c = (b > 0) * (b > 6.0 ? 6.0 : b) * a * 0.166667;
+        y        = c;
+    };
+    // Question: should half_t be supported ?
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        float a = x0 + x1;
+        float b = a + 3.0f;
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
+        y       = c;
+    };
 };
-} // namespace binary_element_wise
+} // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
 #pragma once
 #include "data_type.hpp"
 #include "math_v2.hpp"
+#include "unary_element_wise_operation.hpp"
+#include "binary_element_wise_operation.hpp"
 namespace ck {
 namespace tensor_operation {
 namespace element_wise {
-struct PassThrough
+// Need to ensure compiler will fail if there is no matching candidate, instead of compiler
-{
+// siliently do implicit type conversion
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x; }
+//
+// Method 1:
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; }
+//
+// struct ExampleElementwiseOp
-    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const { y = x; }
+// {
+//     template<typename Y, typename X>
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; }
+//     __host__ __device__ constexpr void
+//     operator()(Y&, const X) const;
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; }
+//
+//     template<>
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x; }
+//     __host__ __device__ constexpr void
-};
+//     operator()<half_t, half_t>(half_t& y, const half_t& x) const
+//     {
-struct Add
+//     }
-{
+// };
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
+//
-    {
+// Method 2:
-        y = x0 + x1;
+//
-    }
+// template <typename Y, typename X>
+// struct ExampleElementwiseOp;
-    __host__ __device__ constexpr void
+//
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
+// template <>
-    {
+// struct ExampleElementwiseOp<float, ck::bhalf_t>
-        // FIXME - Use float (acc type) bias in the future.
+// {
-        y = x0 + x1;
+//     __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
-    }
+//     {
-};
+//     }
+// };
-struct AlphaBetaAdd
-{
-    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta) {}
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        y = alpha_ * x0 + beta_ * x1;
-    }
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        // FIXME - Let x0 be acc type
-        y = static_cast<half_t>(alpha_ * static_cast<float>(x0) + beta_ * static_cast<float>(x1));
-    }
-    float alpha_;
-    float beta_;
-};
-struct AddRelu
-{
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        const float a = x0 + x1;
-        y             = a > 0 ? a : 0;
-    }
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        const half_t a = x0 + x1;
-        y              = a > 0 ? a : 0;
-    }
-};
-struct AddHardswish
-{
-    __host__ __device__ constexpr void operator()(float& y, const float& x0, const float& x1) const
-    {
-        float a = x0 + x1;
-        float b = a + float{3};
-        float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
-        y       = c;
-    }
-    __host__ __device__ constexpr void
-    operator()(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        float a = x0 + x1;
-        float b = a + float{3};
-        float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667};
-        y       = c;
-    }
-};
 struct AddReluAdd
 {
-    __host__ __device__ constexpr void
+    template <typename Y, typename X0, typename X1, typename X2>
-    operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
+    __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
    {
        half_t a = x0 + x1;
        half_t b = a > 0 ? a : 0;
        y        = b + x2;
    }
-    __host__ __device__ constexpr void
+    template <>
-    operator()(float& y, const float& x0, const float& x1, const float& x2) const
+    __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
+                                                                              const float& x0,
+                                                                              const float& x1,
+                                                                              const float& x2) const
    {
        float a = x0 + x1;
        float b = a > 0 ? a : 0;
@@ -111,8 +66,9 @@ struct AddReluAdd
        y       = c;
    }
-    __host__ __device__ constexpr void
+    template <>
-    operator()(half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
+    __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
+        half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
    {
        float a = x0 + x1;
        float b = a > 0 ? a : 0;
@@ -123,8 +79,14 @@ struct AddReluAdd
 struct AddHardswishAdd
 {
-    __host__ __device__ constexpr void
+    template <typename Y, typename X0, typename X1, typename X2>
-    operator()(float& y, const float& x0, const float& x1, const float& x2) const
+    __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const;
+    template <>
+    __host__ __device__ constexpr void operator()<float, float, float, float>(float& y,
+                                                                              const float& x0,
+                                                                              const float& x1,
+                                                                              const float& x2) const
    {
        float a = x0 + x1;
        float b = a + float{3};
@@ -133,8 +95,9 @@ struct AddHardswishAdd
        y       = d;
    }
-    __host__ __device__ constexpr void
+    template <>
-    operator()(half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
+    __host__ __device__ constexpr void operator()<half_t, half_t, half_t, half_t>(
+        half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const
    {
        float a = x0 + x1;
        float b = a + float{3};
@@ -144,206 +107,95 @@ struct AddHardswishAdd
    }
 };
-struct Normalize
+// C = A * B
+// E = FastGelu(C + D0 + D1)
+struct AddAddFastGelu
 {
-    Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {}
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ void operator()(E&, const C&, const D0&, const D1&) const;
-    __host__ __device__ constexpr void operator()(float& y,
+    template <>
-                                                  const float& x,
+    __host__ __device__ void operator()<half_t, float, half_t, half_t>(half_t& e,
-                                                  const float& mean,
+                                                                       const float& c,
-                                                  const float& mean_square,
+                                                                       const half_t& d0,
-                                                  const float& gamma,
+                                                                       const half_t& d1) const
-                                                  const float& beta) const
    {
-        float variance = mean_square - (mean * mean);
+        // Fast GeLU
-        y              = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta;
+        // https://paperswithcode.com/method/gelu
-    }
+        // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+        const auto fast_gelu = [&](float x) {
-    float epsilon_;
+            const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
-};
+            const float emu = exp(-u);
+            const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
+            return x * cdf;
+        };
-// Unary operators are usually called element-wisely before/after the reduction is executed on the
+        const float y = fast_gelu(c + float(d0) + float(d1));
-// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
-template <typename Y, typename X, bool HasDividing = false>
+        e = type_convert<half_t>(y);
-struct UnaryIdentic;
+    }
-template <>
-struct UnaryIdentic<float, float, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x; };
-};
-template <>
-struct UnaryIdentic<float, float, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x / type_convert<float>(divider_);
-    };
-    int32_t divider_ = 1;
-};
-template <>
-struct UnaryIdentic<half_t, half_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = x; };
 };
-template <>
+struct Normalize
-struct UnaryIdentic<double, double, false>
 {
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
+    // FIXME: is double absolutely necessary?
+    Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {}
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x; };
+    template <typename T>
-};
+    __host__ __device__ constexpr void operator()(
+        T& y, const T& x, const T& mean, const T& mean_square, const T& gamma, const T& beta) const;
-template <>
-struct UnaryIdentic<double, double, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const
+    template <>
+    __host__ __device__ constexpr void operator()<float>(float& y,
+                                                         const float& x,
+                                                         const float& mean,
+                                                         const float& mean_square,
+                                                         const float& gamma,
+                                                         const float& beta) const
    {
-        y = x / type_convert<double>(divider_);
+        using ck::math::sqrt;
-    };
-    int32_t divider_ = 1;
-};
-template <>
-struct UnaryIdentic<int32_t, int32_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
-};
-template <>
-struct UnaryIdentic<int32_t, int32_t, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
-    int32_t divider_ = 1;
-};
-template <>
-struct UnaryIdentic<int8_t, int8_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
-};
-template <typename Y, typename X, bool HasDividing = false>
-struct UnarySquare;
-template <>
-struct UnarySquare<float, float, false>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
+        float variance = mean_square - (mean * mean);
-};
+        y = ((x - mean) / sqrt(variance + static_cast<float>(epsilon_))) * gamma + beta;
-template <>
-struct UnarySquare<float, float, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x * x / type_convert<float>(divider_);
    };
-    int32_t divider_ = 1;
+    template <>
-};
+    __host__ __device__ constexpr void operator()<double>(double& y,
+                                                          const double& x,
-template <>
+                                                          const double& mean,
-struct UnarySquare<double, double, false>
+                                                          const double& mean_square,
-{
+                                                          const double& gamma,
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
+                                                          const double& beta) const
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
-};
-template <>
-struct UnarySquare<double, double, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const
    {
-        y = x * x / type_convert<double>(divider_);
+        using ck::math::sqrt;
-    };
-    int32_t divider_ = 1;
+        double variance = mean_square - (mean * mean);
-};
+        y               = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta;
+    };
-template <typename Y, typename X>
-struct UnaryAbs;
-template <>
-struct UnaryAbs<float, float>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::abs(x); };
-};
-template <>
-struct UnaryAbs<half_t, half_t>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = ck::math::abs(x); };
-};
-template <>
-struct UnaryAbs<double, double>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::abs(x); };
-};
-template <>
-struct UnaryAbs<int8_t, int8_t>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = ck::math::abs(x); };
+    // FIXME: is double absolutely necessary?
+    double epsilon_;
 };
 template <typename Y, typename X>
-struct UnarySqrt;
+struct UnaryTypeConvert;
 template <>
-struct UnarySqrt<float, float>
+struct UnaryTypeConvert<float, ck::bhalf_t>
 {
-    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
+    {
-    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::sqrt(x); };
+        y = ck::type_convert<float, ck::bhalf_t>(x);
+    }
 };
 template <>
-struct UnarySqrt<double, double>
+struct UnaryTypeConvert<ck::bhalf_t, float>
 {
-    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
+    __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
-    __host__ __device__ void operator()(double& y, const double& x) const
    {
-        y = ck::math::sqrt(x);
+        y = ck::type_convert<ck::bhalf_t, float>(x);
-    };
+    }
 };
 } // namespace element_wise

--- a/include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
-#pragma once
-#include "data_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace element_wise {
-} // namespace element_wise
-} // namespace tensor_operation
-} // namespace ck
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+#pragma once
+#include "data_type.hpp"
+#include "math_v2.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace element_wise {
+struct PassThrough
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
+                          is_same<T, int32_t>::value || is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x;
+    };
+};
+struct UnaryDivide
+{
+    __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider){};
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, int32_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x / type_convert<T>(divider_);
+    };
+    int32_t divider_ = 1;
+};
+struct UnarySquare
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value,
+                      "Data type is not supported by this operation!");
+        y = x * x;
+    };
+};
+struct UnaryAbs
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = ck::math::abs(x);
+    };
+};
+struct UnarySqrt
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value,
+                      "Data type is not supported by this operation!");
+        y = ck::math::sqrt(x);
+    };
+};
+struct Relu
+{
+    template <typename T>
+    __host__ __device__ void operator()(T& y, const T& x) const
+    {
+        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
+                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
+                          is_same<T, int8_t>::value,
+                      "Data type is not supported by this operation!");
+        y = x > 0 ? x : 0;
+    }
+    template <>
+    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
+    {
+        float x_f32 = ck::type_convert<float>(x);
+        float y_f32 = x_f32 > 0 ? x_f32 : 0;
+        y           = ck::type_convert<bhalf_t>(y_f32);
+    }
+};
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+struct FastGelu
+{
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const;
+    template <>
+    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
+    {
+        const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
+        const float emu = exp(-u);
+        const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
+        y = x * cdf;
+    }
+};
+} // namespace element_wise
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
@@ -171,15 +171,15 @@ struct GridwiseReduction_mk_to_m_multiblock
                               AccDataType beta,
                               OutDataType* const __restrict__ p_out_value_global)
    {
-        const auto identityVal = ReduceOperation::GetIdentityValue();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
        // LDS
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];
-        const auto in_global_val_buf =
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
+            p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
+            in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(identityVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock
        __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
        __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];
-        const auto identityVal = ReduceOperation::GetIdentityValue();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
-        const auto in_global_val_buf =
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
+            p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
+            in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(identityVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
        auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(

--- a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
@@ -135,12 +135,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                     ReduceOperation,
                                                     PropagateNan>;
-        const auto identityVal = ReduceOperation::GetIdentityValue();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
-        const auto in_global_val_buf =
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
+            p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
+            in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(identityVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -276,12 +276,12 @@ struct GridwiseReduction_mk_to_m_threadwise
        (void)acc_elementwise_op;
-        const auto identityVal = ReduceOperation::GetIdentityValue();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();
-        const auto in_global_val_buf =
+        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            make_dynamic_buffer<AddressSpaceEnum::Global>(p_in_value_global,
+            p_in_value_global,
-                                                          in_grid_desc_m_k.GetElementSpaceSize(),
+            in_grid_desc_m_k.GetElementSpaceSize(),
-                                                          type_convert<InDataType>(identityVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
        const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
+#pragma once
+#include "multi_index_transform_helper.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "blockwise_gemm_xdlops.hpp"
+#include "thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "thread_group_tensor_slice_transfer_v6r1.hpp"
+#include "threadwise_tensor_slice_transfer.hpp"
+#include "gridwise_gemm_pipeline_v1.hpp"
+#include "reduction_functions_threadwise.hpp"
+namespace ck {
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatC,
+          typename FloatC0,
+          typename FloatC1,
+          typename DPtrsGlobal,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation,
+          typename AGridDesc_AK0_M_AK1,
+          typename BGridDesc_BK0_N_BK1,
+          typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename DGridDescriptor_MBlock_MPerBlock,
+          typename Block2CTileMap,
+          bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_bias_add_reduce_xdl_cshuffle_v1(
+            const FloatAB* __restrict__ p_a_grid,
+            const FloatAB* __restrict__ p_b_grid,
+            FloatC* __restrict__ p_c_grid,
+            const FloatC0* __restrict__ p_c0_grid,
+            const FloatC1* __restrict__ p_c1_grid,
+            DPtrsGlobal p_ds_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CElementwiseOperation c_element_op,
+            const C1ElementwiseOperation c1_element_op,
+            const DxsInElementwiseOperation dxs_in_element_op,
+            const DxsReduceAccElementwiseOperation dxs_out_element_op,
+            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
+            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c0_grid_desc_mblock_mperblock_nblock_nperblock,
+            const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                c1_grid_desc_mblock_mperblock_nblock_nperblock,
+            const DGridDescriptor_MBlock_MPerBlock d_grid_desc_mblock_mperblock,
+            const Block2CTileMap block_2_ctile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,
+                                                  p_b_grid,
+                                                  p_c_grid,
+                                                  p_c0_grid,
+                                                  p_c1_grid,
+                                                  p_ds_grid,
+                                                  p_shared,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  c_element_op,
+                                                  c1_element_op,
+                                                  dxs_in_element_op,
+                                                  dxs_out_element_op,
+                                                  a_grid_desc_ak0_m_ak1,
+                                                  b_grid_desc_bk0_n_bk1,
+                                                  c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  d_grid_desc_mblock_mperblock,
+                                                  block_2_ctile_map);
+#else
+    ignore = p_a_grid;
+    ignore = p_b_grid;
+    ignore = p_c_grid;
+    ignore = p_c0_grid;
+    ignore = p_c1_grid;
+    ignore = p_ds_grid;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = c_element_op;
+    ignore = c1_element_op;
+    ignore = dxs_in_element_op;
+    ignore = dxs_out_element_op;
+    ignore = a_grid_desc_ak0_m_ak1;
+    ignore = b_grid_desc_bk0_n_bk1;
+    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = c0_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = c1_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = d_grid_desc_mblock_mperblock;
+    ignore = block_2_ctile_map;
+#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+}
+template <typename FloatAB,
+          typename FloatGemmAcc,
+          typename FloatCShuffle,
+          typename FloatC,
+          typename FloatC0,
+          typename FloatC1,
+          typename FloatReduceAcc,
+          typename DPtrsGlobal,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename C1ElementwiseOperation,
+          typename DxsReduceOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsReduceAccElementwiseOperation,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          typename DGlobalMemoryDataOperation,
+          typename AGridDesc_AK0_M_AK1,
+          typename BGridDesc_BK0_N_BK1,
+          typename CGridDesc_M_N,
+          typename C0GridDesc_M_N,
+          typename C1GridDesc_M_N,
+          typename DGridDesc_M,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          typename CReduceThreadClusterLengths_MPerBlock_NPerBlock,
+          index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+          LoopScheduler LoopSched>
+struct GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+    // K1 should be Number<...>
+    static constexpr auto AK0 = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0 = Number<KPerBlock / BK1Value>{};
+    static constexpr auto AK1 = Number<AK1Value>{};
+    static constexpr auto BK1 = Number<BK1Value>{};
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+    using GridwiseGemmPipe = GridwiseGemmPipeline_v1<NumGemmKPrefetchStage>;
+    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(AK0, Number<MPerBlock>{}, AK1),
+            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
+    }
+    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // B matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(BK0, Number<NPerBlock>{}, BK1),
+            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
+    }
+    __host__ __device__ static constexpr auto
+    GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+        return math::max((a_block_space_size_aligned + b_block_space_size_aligned) *
+                             sizeof(FloatAB),
+                         c_block_size * sizeof(FloatCShuffle));
+    }
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    template <typename Block2CTileMap>
+    __host__ __device__ static constexpr bool
+    CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
+                  const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
+                  const CGridDesc_M_N& c_grid_desc_m_n,
+                  const Block2CTileMap& block_2_ctile_map)
+    {
+        // static_assert(is_known_at_compile_time<remove_cv_t<decltype(AK1)>>::value &&
+        //               is_known_at_compile_time<remove_cv_t<decltype(BK1)>>::value,
+        //               "wrong! K1 need to be known at compile-time");
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+        const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1);
+        const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1);
+        const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2);
+        if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1)))
+            return false;
+        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
+            return false;
+        // check gridwise gemm pipeline
+        const auto num_k_loop = K / KPerBlock;
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+            return false;
+        }
+        if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n))
+        {
+            return false;
+        }
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return true;
+    }
+    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+        return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
+    }
+    template <typename CGridDesc_M_N_>
+    __host__ __device__ static constexpr auto
+    MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N_& c_grid_desc_m_n)
+    {
+        const auto M = c_grid_desc_m_n.GetLength(I0);
+        const auto N = c_grid_desc_m_n.GetLength(I1);
+        const auto MBlock = M / MPerBlock;
+        const auto NBlock = N / NPerBlock;
+        const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            c_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+        return c_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+    __host__ __device__ static constexpr auto
+    MakeDGridDescriptor_MBlock_MPerBlock(const DGridDesc_M& d_grid_desc_m)
+    {
+        const auto M      = d_grid_desc_m.GetLength(I0);
+        const auto MBlock = M / MPerBlock;
+        const auto d_grid_desc_mblock_mperblock = transform_tensor_descriptor(
+            d_grid_desc_m,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{}))),
+            make_tuple(Sequence<0>{}),
+            make_tuple(Sequence<0, 1>{}));
+        return d_grid_desc_mblock_mperblock;
+    }
+    // return block_id to C matrix tile idx (m0, n0) mapping
+    __host__ __device__ static constexpr auto
+    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n)
+    {
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(
+            c_grid_desc_m_n);
+    }
+    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
+    using C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(C0GridDesc_M_N{}))>;
+    using C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(C1GridDesc_M_N{}))>;
+    using DGridDescriptor_MBlock_MPerBlock =
+        remove_cvref_t<decltype(MakeDGridDescriptor_MBlock_MPerBlock(DGridDesc_M{}))>;
+    using DefaultBlock2CTileMap =
+        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}))>;
+    template <bool HasMainKBlockLoop, typename Block2CTileMap>
+    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
+                               const FloatAB* __restrict__ p_b_grid,
+                               FloatC* __restrict__ p_c_grid,
+                               const FloatC0* __restrict__ p_c0_grid,
+                               const FloatC1* __restrict__ p_c1_grid,
+                               DPtrsGlobal p_ds_grid,
+                               void* __restrict__ p_shared,
+                               const AElementwiseOperation& a_element_op,
+                               const BElementwiseOperation& b_element_op,
+                               const CElementwiseOperation& c_element_op,
+                               const C1ElementwiseOperation& c1_element_op,
+                               const DxsInElementwiseOperation& dxs_in_element_op,
+                               const DxsReduceAccElementwiseOperation& dxs_out_element_op,
+                               const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1,
+                               const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1,
+                               const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const DGridDescriptor_MBlock_MPerBlock& d_grid_desc_mblock_mperblock,
+                               const Block2CTileMap& block_2_ctile_map)
+    {
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+        auto c0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c0_grid, c0_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+        auto c1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_c1_grid, c1_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+        if(!block_2_ctile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                AElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<AK0, MPerBlock, AK1>,
+                                                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                                                ABlockTransferThreadClusterArrangeOrder,
+                                                FloatAB,
+                                                FloatAB,
+                                                decltype(a_grid_desc_ak0_m_ak1),
+                                                decltype(a_block_desc_ak0_m_ak1),
+                                                ABlockTransferSrcAccessOrder,
+                                                Sequence<1, 0, 2>,
+                                                ABlockTransferSrcVectorDim,
+                                                2,
+                                                ABlockTransferSrcScalarPerVector,
+                                                ABlockTransferDstScalarPerVector_AK1,
+                                                1,
+                                                1,
+                                                AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                NumGemmKPrefetchStage>(
+                a_grid_desc_ak0_m_ak1,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_element_op,
+                a_block_desc_ak0_m_ak1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            ThreadGroupTensorSliceTransfer_v4r1<ThisThreadBlock,
+                                                BElementwiseOperation,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                InMemoryDataOperationEnum::Set,
+                                                Sequence<BK0, NPerBlock, BK1>,
+                                                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                                                BBlockTransferThreadClusterArrangeOrder,
+                                                FloatAB,
+                                                FloatAB,
+                                                decltype(b_grid_desc_bk0_n_bk1),
+                                                decltype(b_block_desc_bk0_n_bk1),
+                                                BBlockTransferSrcAccessOrder,
+                                                Sequence<1, 0, 2>,
+                                                BBlockTransferSrcVectorDim,
+                                                2,
+                                                BBlockTransferSrcScalarPerVector,
+                                                BBlockTransferDstScalarPerVector_BK1,
+                                                1,
+                                                1,
+                                                BThreadTransferSrcResetCoordinateAfterRun,
+                                                true,
+                                                NumGemmKPrefetchStage>(
+                b_grid_desc_bk0_n_bk1,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_element_op,
+                b_block_desc_bk0_n_bk1,
+                make_multi_index(0, 0, 0),
+                ck::tensor_operation::element_wise::PassThrough{});
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[K0PerBlock, MPerBlock] is in LDS
+        //     b_mtx[K0PerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+        constexpr index_t KPack = math::max(
+            math::lcm(AK1, BK1), MfmaSelector<FloatAB, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
+            BlockSize,
+            FloatAB,
+            FloatGemmAcc,
+            decltype(a_block_desc_ak0_m_ak1),
+            decltype(b_block_desc_bk0_n_bk1),
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            KPack,
+            LoopSched>();
+        auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<FloatAB*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<FloatAB*>(p_shared) + a_block_space_size_aligned,
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
+        // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_v1_Selector<NumGemmKPrefetchStage, LoopSched>();
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
+            KPerBlock);
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_ak0_m_ak1,
+                                                               a_block_desc_ak0_m_ak1,
+                                                               a_blockwise_copy,
+                                                               a_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               b_grid_desc_bk0_n_bk1,
+                                                               b_block_desc_bk0_n_bk1,
+                                                               b_blockwise_copy,
+                                                               b_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
+        // shuffle C + reduction + write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<FloatCShuffle*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<FloatGemmAcc,
+                                                   FloatCShuffle,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+            // space filling curve for threadwise C in VGPR
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+            // space filling curve for shuffled blockwise C in global mem
+            constexpr auto sfc_c_global =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+            // TODO: this should be implemented as a blockwise reduction
+            // LDS c_reduce_block_desc_mperblock_nperblock
+            constexpr auto c_reduce_block_desc_mperblock_nperblock = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_pass_through_transform(
+                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I1)),
+                    make_freeze_transform(I0),
+                    make_pass_through_transform(
+                        c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I3))),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<>{}, Sequence<1>{}));
+            static_assert(CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) *
+                                  CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) ==
+                              BlockSize,
+                          "wrong!");
+            static_assert((CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) %
+                                      CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) ==
+                                  0 &&
+                              (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) %
+                                      CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) ==
+                                  0,
+                          "wrong!");
+            constexpr index_t mreduce_per_thread =
+                (CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) /
+                CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0);
+            constexpr index_t nreduce_per_thread =
+                (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) /
+                CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1);
+            constexpr auto c_reduce_thread_lengths_mperblock_nperblock =
+                Sequence<mreduce_per_thread, nreduce_per_thread>{};
+            // VGPR c_reduce_thread_desc_mperblock_nperblock
+            constexpr auto c_reduce_thread_desc_mperblock_nperblock =
+                make_naive_tensor_descriptor_packed(
+                    make_tuple(Number<mreduce_per_thread>{}, Number<nreduce_per_thread>{}));
+            // VGPR d_reduce_thread_desc_mperblock
+            constexpr auto d_reduce_thread_desc_mperblock =
+                make_naive_tensor_descriptor_packed(make_tuple(Number<mreduce_per_thread>{}));
+            // VGPR d_reduce_thread_desc_mblock_mperblock
+            constexpr auto d_reduce_thread_desc_mblock_mperblock =
+                make_naive_tensor_descriptor_packed(make_tuple(I1, Number<mreduce_per_thread>{}));
+            auto c_reduce_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatReduceAcc>(
+                c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize());
+            // reduce: threadwise copy from LDS to VGPR
+            constexpr auto c_reduce_thread_cluster_desc = make_cluster_descriptor(
+                CReduceThreadClusterLengths_MPerBlock_NPerBlock{}, Sequence<1, 0>{});
+            const auto c_reduce_thread_cluster_idx =
+                c_reduce_thread_cluster_desc.CalculateBottomIndex(
+                    make_multi_index(get_thread_local_1d_id()));
+            const auto c_reduce_thread_data_idx_begin =
+                c_reduce_thread_cluster_idx * c_reduce_thread_lengths_mperblock_nperblock;
+            auto c_reduce_thread_copy_lds_to_vgpr = ThreadwiseTensorSliceTransfer_v2<
+                FloatCShuffle,
+                FloatReduceAcc,
+                decltype(c_reduce_block_desc_mperblock_nperblock),
+                decltype(c_reduce_thread_desc_mperblock_nperblock),
+                decltype(c_reduce_thread_lengths_mperblock_nperblock),
+                Sequence<0, 1>,
+                1,
+                CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+                1,
+                true>{c_reduce_block_desc_mperblock_nperblock, c_reduce_thread_data_idx_begin};
+            auto dxs_reduce_thread_copy_vgpr_to_global = generate_tuple(
+                [&](auto I) {
+                    auto p_d_grid         = p_ds_grid[I];
+                    auto d_out_element_op = dxs_out_element_op[I];
+                    return ThreadwiseTensorSliceTransfer_v1r3<
+                        FloatReduceAcc,
+                        remove_pointer_t<decltype(p_d_grid)>,
+                        decltype(d_reduce_thread_desc_mblock_mperblock),
+                        decltype(d_grid_desc_mblock_mperblock),
+                        decltype(d_out_element_op),
+                        Sequence<1, mreduce_per_thread>,
+                        Sequence<0, 1>,
+                        1,
+                        CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
+                        DGlobalMemoryDataOperation::At(I),
+                        1,
+                        false>{d_grid_desc_mblock_mperblock,
+                               make_multi_index(block_work_idx[I0],                  // mblock
+                                                c_reduce_thread_data_idx_begin[I0]), // mperblock
+                               d_out_element_op};
+                },
+                Number<p_ds_grid.Size()>{});
+            // c0 and c1
+            constexpr auto c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock =
+                make_naive_tensor_descriptor_packed(
+                    make_tuple(I1, Number<mreduce_per_thread>{}, I1, Number<nreduce_per_thread>{}));
+            constexpr auto c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock =
+                c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock;
+            auto c01_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatReduceAcc>(
+                c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+            auto c0_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2<
+                FloatC0,
+                FloatReduceAcc,
+                decltype(c0_grid_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<I1, mreduce_per_thread, I1, nreduce_per_thread>,
+                Sequence<0, 1, 2, 3>,
+                3,
+                CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+                1,
+                true>(
+                c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                make_multi_index(I0,
+                                 m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0],
+                                 I0,
+                                 n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1]));
+            auto c1_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2<
+                FloatC1,
+                FloatReduceAcc,
+                decltype(c1_grid_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock),
+                Sequence<I1, mreduce_per_thread, I1, nreduce_per_thread>,
+                Sequence<0, 1, 2, 3>,
+                3,
+                CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+                1,
+                true>(
+                c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                make_multi_index(I0,
+                                 m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0],
+                                 I0,
+                                 n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1]));
+            constexpr auto c_reduce_thread_desc_mblock_mperblock_nblock_nperblock =
+                make_naive_tensor_descriptor_packed(
+                    make_tuple(I1, Number<mreduce_per_thread>{}, I1, Number<nreduce_per_thread>{}));
+            auto c_reduce_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3<
+                FloatReduceAcc,
+                FloatC,
+                decltype(c_reduce_thread_desc_mblock_mperblock_nblock_nperblock),
+                decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
+                tensor_operation::element_wise::PassThrough,
+                Sequence<I1, mreduce_per_thread, I1, nreduce_per_thread>, // SliceLengths
+                Sequence<0, 1, 2, 3>,                                     // DimAccessOrder
+                3,                                                        // DstVectorDim
+                CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
+                InMemoryDataOperationEnum::Set,
+                1,
+                true>{
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+                make_multi_index(I0,
+                                 m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0],
+                                 I0,
+                                 n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1]),
+                tensor_operation::element_wise::PassThrough{}};
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+            static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!");
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+                {
+                    c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock,
+                                                         c_shuffle_block_buf,
+                                                         c_reduce_thread_desc_mperblock_nperblock,
+                                                         make_tuple(I0, I0),
+                                                         c_reduce_thread_buf);
+                    c0_thread_copy_global_to_vgpr.Run(
+                        c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                        c0_grid_buf,
+                        c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
+                        make_tuple(I0, I0, I0, I0),
+                        c01_thread_buf);
+                    // c = activation(c + bias)
+                    static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
+                        [&](auto i) {
+                            FloatReduceAcc out;
+                            c_element_op(out, c_reduce_thread_buf(i) + c01_thread_buf(i));
+                            c_reduce_thread_buf(i) = out;
+                        });
+                    c1_thread_copy_global_to_vgpr.Run(
+                        c1_grid_desc_mblock_mperblock_nblock_nperblock,
+                        c1_grid_buf,
+                        c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
+                        make_tuple(I0, I0, I0, I0),
+                        c01_thread_buf);
+                    // c = c + c1_functior(c1)
+                    static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
+                        [&](auto i) {
+                            c1_element_op(c01_thread_buf(i), c01_thread_buf(i));
+                            c_reduce_thread_buf(i) += c01_thread_buf(i);
+                        });
+                    c_reduce_thread_copy_vgpr_to_global.Run(
+                        c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
+                        make_tuple(I0, I0, I0, I0),
+                        c_reduce_thread_buf,
+                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                        c_grid_buf);
+                    static_for<0, p_ds_grid.Size(), 1>{}([&](auto In) {
+                        auto& p_d_grid = p_ds_grid[In];
+                        auto d_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                            p_d_grid, d_grid_desc_mblock_mperblock.GetElementSpaceSize());
+                        auto d_thread_buf =
+                            make_static_buffer<AddressSpaceEnum::Vgpr, FloatReduceAcc>(
+                                d_reduce_thread_desc_mperblock.GetElementSpaceSize());
+                        auto& d_in_element_op = dxs_in_element_op[In];
+                        auto& d_reduce_thread_copy_vgpr_to_global =
+                            dxs_reduce_thread_copy_vgpr_to_global(In);
+                        using DReduceOperation = remove_cvref_t<decltype(DxsReduceOperation{}[In])>;
+                        using ThreadwiseReduce =
+                            ThreadwiseReduction<FloatReduceAcc,
+                                                decltype(c_reduce_thread_desc_mperblock_nperblock),
+                                                decltype(d_reduce_thread_desc_mperblock),
+                                                DReduceOperation,
+                                                false>;
+                        // Global write Gemm shuffle + reduction
+                        const auto d_zeroVal =
+                            DReduceOperation::template GetIdentityValue<FloatReduceAcc>();
+                        static_for<0, mreduce_per_thread, 1>{}(
+                            [&](auto I) { d_thread_buf(I) = d_zeroVal; });
+                        // reduce in VGPR
+                        static_for<0, mreduce_per_thread, 1>{}([&](auto im) {
+                            static_for<0, nreduce_per_thread, 1>{}([&](auto in) {
+                                constexpr auto offset =
+                                    Number<c_reduce_thread_desc_mperblock_nperblock.CalculateOffset(
+                                        make_tuple(im, in))>{};
+                                d_in_element_op(c_reduce_thread_buf(offset),
+                                                c_reduce_thread_buf(offset));
+                            });
+                        });
+                        ThreadwiseReduce::Reduce(c_reduce_thread_buf, d_thread_buf);
+                        // copy from VGPR to Global
+                        d_reduce_thread_copy_vgpr_to_global.Run(
+                            d_reduce_thread_desc_mblock_mperblock,
+                            make_tuple(I0, I0),
+                            d_thread_buf,
+                            d_grid_desc_mblock_mperblock,
+                            d_grid_buf);
+                        if constexpr(access_id < num_access - 1)
+                        {
+                            constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+                            d_reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow(
+                                d_grid_desc_mblock_mperblock,
+                                make_tuple(c_global_step[I0], c_global_step[I1]));
+                        }
+                    });
+                }
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
+                    // move on C
+                    c_reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow(
+                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                    // move on C0
+                    c0_thread_copy_global_to_vgpr.MoveSrcSliceWindow(
+                        c0_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                    // move on C1
+                    c1_thread_copy_global_to_vgpr.MoveSrcSliceWindow(
+                        c1_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
+                }
+            });
+        } // Reduction
+    }
+};
+} // namespace ck